Skip to content

Commit 26463e2

Browse files
refactor : separate UriMatchProcessor
1 parent 303e116 commit 26463e2

File tree

14 files changed

+125
-119
lines changed

14 files changed

+125
-119
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ For ES5 users,
1717
<body>
1818
<script src="../dist/url-knife.bundle.js"></script>
1919
<--! OR !-->
20-
<script src="https://cdn.jsdelivr.net/gh/patternknife/url-knife@4.0.1/dist/url-knife.bundle.min.js"></script>
20+
<script src="https://cdn.jsdelivr.net/gh/patternknife/url-knife@4.1.2/dist/url-knife.bundle.min.js"></script>
2121

2222
<script type="text/javascript">
2323
</script>

dist/api/TextAreaApi.js

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Object.defineProperty(exports, "__esModule", { value: true });
33
exports.TextAreaApi = void 0;
44
const SafeConditionalUrlPatternBuilder_1 = require("../pattern/SafeConditionalUrlPatternBuilder");
5-
const DomainPatterns_1 = require("../pattern/DomainPatterns");
65
const TextAreaService_1 = require("../service/TextAreaService");
6+
const UriMatchProcessor_1 = require("../bo/UriMatchProcessor");
77
exports.TextAreaApi = {
88
/**
99
* @brief
@@ -58,42 +58,8 @@ exports.TextAreaApi = {
5858
if (!(textStr && typeof textStr === 'string')) {
5959
throw new Error('the variable textStr must be a string type and not be null.');
6060
}
61-
let obj = TextAreaService_1.TextAreaService.extractCertainPureUris(textStr, uris, endBoundary);
62-
let obj2 = TextAreaService_1.TextAreaService.extractAllPureUrls(textStr);
63-
//console.log('obj : ' + JSON.stringify(obj));
64-
let obj_final = [];
65-
for (let a = 0; a < obj.length; a++) {
66-
let obj_part = {
67-
uriDetected: undefined,
68-
inWhatUrl: undefined,
69-
};
70-
//let matchedUrlFound = false;
71-
for (let b = 0; b < obj2.length; b++) {
72-
if ((obj[a].index.start > obj2[b].index.start && obj[a].index.start < obj2[b].index.end)
73-
&&
74-
(obj[a].index.end > obj2[b].index.start && obj[a].index.end <= obj2[b].index.end)) {
75-
// Here, the uri detected is inside its url
76-
// false positives like the example '//google.com/abc/def?a=5&b=7' can be detected in 'Service.Text.extractCertainPureUris'
77-
let sanitizedUrl = obj[a]['value']['url'] || "";
78-
let rx = new RegExp('^(\\/\\/[^/]*|\\/[^\\s]+\\.' + DomainPatterns_1.DomainPatterns.allRootDomains + ')', 'gi');
79-
let matches = [];
80-
let match;
81-
while ((match = rx.exec(obj[a].value.url || "")) !== null) {
82-
if (match[1]) {
83-
sanitizedUrl = sanitizedUrl.replace(rx, '');
84-
//console.log(match[1]);
85-
obj[a].value.url = sanitizedUrl;
86-
obj[a].index.start += match[1].length;
87-
obj[a].value.onlyUriWithParams = obj[a].value.url;
88-
obj[a].value.onlyUri = (obj[a].value.url || "").replace(/\?[^/]*$/gi, '');
89-
}
90-
}
91-
obj_part.inWhatUrl = obj2[b];
92-
}
93-
}
94-
obj_part.uriDetected = obj[a];
95-
obj_final.push(obj_part);
96-
}
97-
return obj_final;
61+
let uriMatchList = TextAreaService_1.TextAreaService.extractCertainPureUris(textStr, uris, endBoundary);
62+
let urlMatchList = TextAreaService_1.TextAreaService.extractAllPureUrls(textStr);
63+
return (0, UriMatchProcessor_1.processAllUriMatches)(uriMatchList, urlMatchList);
9864
},
9965
};

dist/api/XmlAreaApi.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ const EmailPatternBuilder_1 = require("../pattern/EmailPatternBuilder");
77
const BasePatterns_1 = require("../pattern/BasePatterns");
88
const XmlAreaService_1 = require("../service/XmlAreaService");
99
const UrlAreaService_1 = require("../service/UrlAreaService");
10+
/*
11+
* The XmlArea should be refactored, and text codes should be included.
12+
* */
1013
exports.XmlAreaApi = {
1114
/**
1215
*
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
44
};
55
Object.defineProperty(exports, "__esModule", { value: true });
66
exports.Normalizer = void 0;
7-
const util_1 = __importDefault(require("./util"));
8-
const FuzzyPartialUrlPatterns_1 = require("./pattern/FuzzyPartialUrlPatterns");
9-
const BasePatterns_1 = require("./pattern/BasePatterns");
10-
const ProtocolPatterns_1 = require("./pattern/ProtocolPatterns");
11-
const DomainPatterns_1 = require("./pattern/DomainPatterns");
7+
const util_1 = __importDefault(require("../util"));
8+
const FuzzyPartialUrlPatterns_1 = require("../pattern/FuzzyPartialUrlPatterns");
9+
const BasePatterns_1 = require("../pattern/BasePatterns");
10+
const ProtocolPatterns_1 = require("../pattern/ProtocolPatterns");
11+
const DomainPatterns_1 = require("../pattern/DomainPatterns");
1212
exports.Normalizer = {
1313
modifiedUrl: null,
1414
extractAndNormalizeProtocolFromSpacesRemovedUrl() {

dist/bo/UriMatchProcessor.js

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"use strict";
2+
Object.defineProperty(exports, "__esModule", { value: true });
3+
exports.processUriMatchInIndexRange = processUriMatchInIndexRange;
4+
exports.processAllUriMatches = processAllUriMatches;
5+
const DomainPatterns_1 = require("../pattern/DomainPatterns");
6+
/**
 * Pairs a single detected URI match with the URL match that encloses it.
 *
 * For each entry of urlMatchList whose index range contains uriMatch
 * (URI start strictly between the URL's start and end, URI end after the
 * URL's start and not past the URL's end), a leading prefix is stripped from
 * uriMatch.value.url, uriMatch.index.start is advanced by the stripped
 * length, and value.onlyUriWithParams / value.onlyUri are recomputed.
 * uriMatch is mutated in place.
 *
 * @param uriMatch     match object exposing index.start, index.end and value.url
 * @param urlMatchList array of URL matches, each exposing index.start and index.end
 * @returns an object { uriDetected, inWhatUrl }: uriDetected is the (possibly
 *          mutated) uriMatch; inWhatUrl is the last enclosing URL match found,
 *          or undefined when none encloses the URI
 */
function processUriMatchInIndexRange(uriMatch, urlMatchList) {
7+
let obj_part = {
8+
uriDetected: undefined,
9+
inWhatUrl: undefined,
10+
};
11+
// No early exit: every URL match is tested, so a later enclosing URL overwrites an earlier one.
for (let i = 0; i < urlMatchList.length; i++) {
12+
// Containment test: the URI must begin after the URL begins and finish no later than the URL ends.
if (uriMatch.index.start > urlMatchList[i].index.start &&
13+
uriMatch.index.start < urlMatchList[i].index.end &&
14+
uriMatch.index.end > urlMatchList[i].index.start &&
15+
uriMatch.index.end <= urlMatchList[i].index.end) {
16+
let sanitizedUrl = uriMatch.value.url || "";
17+
// Anchored prefix pattern: either two slashes followed by non-slash characters,
// or one slash, non-whitespace characters, a dot and DomainPatterns.allRootDomains
// (presumably an alternation of root-domain suffixes; defined outside this view).
let rx = new RegExp("^(\\/\\/[^/]*|\\/[^\\s]+\\." + DomainPatterns_1.DomainPatterns.allRootDomains + ")", "gi");
18+
let match;
19+
// NOTE(review): rx has the 'g' flag and is shared by exec() here and replace() below,
// and uriMatch.value.url is reassigned inside the loop, so each pass re-tests the
// already-sanitized value. Confirm this repeated stripping is intended.
while ((match = rx.exec(uriMatch.value.url || "")) !== null) {
20+
if (match[1]) {
21+
// Drop the matched prefix and shift the start index by its length.
sanitizedUrl = sanitizedUrl.replace(rx, "");
22+
uriMatch.value.url = sanitizedUrl;
23+
uriMatch.index.start += match[1].length;
24+
uriMatch.value.onlyUriWithParams = uriMatch.value.url;
25+
// onlyUri: remove a trailing '?' plus any run of non-slash characters up to the end of the string.
uriMatch.value.onlyUri = (uriMatch.value.url || "").replace(/\?[^/]*$/gi, "");
26+
}
27+
}
28+
obj_part.inWhatUrl = urlMatchList[i];
29+
}
30+
}
31+
obj_part.uriDetected = uriMatch;
32+
return obj_part;
33+
}
34+
/**
 * Runs processUriMatchInIndexRange on every entry of uriMatchList.
 *
 * @param uriMatchList array of URI matches to process
 * @param urlMatchList array of URL matches passed unchanged to each call
 * @returns a new array holding one result of processUriMatchInIndexRange per
 *          URI match, in the same order as uriMatchList
 */
function processAllUriMatches(uriMatchList, urlMatchList) {
35+
return uriMatchList.map((uriMatch) => processUriMatchInIndexRange(uriMatch, urlMatchList));
36+
}

dist/service/UrlAreaService.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
55
Object.defineProperty(exports, "__esModule", { value: true });
66
exports.UrlAreaService = void 0;
77
const valid_1 = __importDefault(require("../valid"));
8-
const normalizer_1 = require("../normalizer");
98
const util_1 = __importDefault(require("../util"));
109
const BasePatterns_1 = require("../pattern/BasePatterns");
1110
const ProtocolPatterns_1 = require("../pattern/ProtocolPatterns");
1211
const DomainPatterns_1 = require("../pattern/DomainPatterns");
1312
const SafeConditionalUrlPatternBuilder_1 = require("../pattern/SafeConditionalUrlPatternBuilder");
13+
const Normalizer_1 = require("../bo/Normalizer");
1414
const queryString = require('query-string');
1515
exports.UrlAreaService = {
1616
/**
@@ -37,21 +37,21 @@ exports.UrlAreaService = {
3737
try {
3838
url = valid_1.default.validateAndTrimString(url);
3939
/* Chapter 1. Normalizing process */
40-
normalizer_1.Normalizer.modifiedUrl = util_1.default.Text.removeAllSpaces(url);
40+
Normalizer_1.Normalizer.modifiedUrl = util_1.default.Text.removeAllSpaces(url);
4141
// 1. full url
4242
obj.url = url;
4343
// 2. protocol
44-
obj.protocol = normalizer_1.Normalizer.extractAndNormalizeProtocolFromSpacesRemovedUrl();
44+
obj.protocol = Normalizer_1.Normalizer.extractAndNormalizeProtocolFromSpacesRemovedUrl();
4545
// 3. Domain
46-
let domainWithType = normalizer_1.Normalizer.extractAndNormalizeDomainFromProtocolRemovedUrl();
46+
let domainWithType = Normalizer_1.Normalizer.extractAndNormalizeDomainFromProtocolRemovedUrl();
4747
obj.type = domainWithType.type;
4848
obj.onlyDomain = domainWithType.domain;
4949
// 4. Port
50-
obj.port = normalizer_1.Normalizer.extractAndNormalizePortFromDomainRemovedUrl();
50+
obj.port = Normalizer_1.Normalizer.extractAndNormalizePortFromDomainRemovedUrl();
5151
// 5. Finalize
52-
obj.normalizedUrl = normalizer_1.Normalizer.finalizeNormalization(obj.protocol, obj.port, obj.onlyDomain);
52+
obj.normalizedUrl = Normalizer_1.Normalizer.finalizeNormalization(obj.protocol, obj.port, obj.onlyDomain);
5353
// 6. Params & URI
54-
let uriParams = normalizer_1.Normalizer.extractAndNormalizeUriParamsFromPortRemovedUrl();
54+
let uriParams = Normalizer_1.Normalizer.extractAndNormalizeUriParamsFromPortRemovedUrl();
5555
obj.onlyUri = uriParams.uri;
5656
obj.onlyParams = uriParams.params;
5757
/* Chapter 2. Post normalizing process (same as the function 'parseUrl')*/

dist/url-knife.bundle.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "url-knife",
3-
"version": "4.1.1",
3+
"version": "4.1.2",
44
"description": "Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with robust patterns.",
55
"main": "src/entry.ts",
66
"scripts": {

src/api/TextAreaApi.ts

Lines changed: 4 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import {EmailMatch, ExtractCertainUriMatch, IndexContainingBaseMatch, NoProtocolJsnParamType} from "../types";
22
import {SafeConditionalUrlPatternBuilder} from "../pattern/SafeConditionalUrlPatternBuilder";
3-
4-
import {DomainPatterns} from "../pattern/DomainPatterns";
53
import {TextAreaService} from "../service/TextAreaService";
4+
import {processAllUriMatches} from "../bo/UriMatchProcessor";
65

76
export const TextAreaApi = {
87

@@ -69,61 +68,10 @@ export const TextAreaApi = {
6968
throw new Error('the variable textStr must be a string type and not be null.');
7069
}
7170

72-
let obj: IndexContainingBaseMatch[] = TextAreaService.extractCertainPureUris(textStr, uris, endBoundary);
73-
let obj2: IndexContainingBaseMatch[] = TextAreaService.extractAllPureUrls(textStr);
74-
75-
76-
//console.log('obj : ' + JSON.stringify(obj));
77-
78-
let obj_final = [];
79-
80-
for (let a = 0; a < obj.length; a++) {
81-
82-
let obj_part: ExtractCertainUriMatch = {
83-
uriDetected: undefined,
84-
inWhatUrl: undefined,
85-
};
86-
87-
//let matchedUrlFound = false;
88-
for (let b = 0; b < obj2.length; b++) {
89-
90-
if ((obj[a].index.start > obj2[b].index.start && obj[a].index.start < obj2[b].index.end)
91-
&&
92-
(obj[a].index.end > obj2[b].index.start && obj[a].index.end <= obj2[b].index.end)) {
93-
94-
// Here, the uri detected is inside its url
95-
// false positives like the example '//google.com/abc/def?a=5&b=7' can be detected in 'Service.Text.extractCertainPureUris'
96-
97-
let sanitizedUrl = obj[a]['value']['url'] || "";
98-
99-
let rx = new RegExp('^(\\/\\/[^/]*|\\/[^\\s]+\\.' + DomainPatterns.allRootDomains + ')', 'gi');
100-
let matches = [];
101-
let match: RegExpExecArray | null;
102-
103-
while ((match = rx.exec(obj[a].value.url || "")) !== null) {
104-
if (match[1]) {
105-
106-
sanitizedUrl = sanitizedUrl.replace(rx, '');
107-
108-
//console.log(match[1]);
109-
110-
obj[a].value.url = sanitizedUrl;
111-
obj[a].index.start += match[1].length;
112-
113-
obj[a].value.onlyUriWithParams = obj[a].value.url;
114-
obj[a].value.onlyUri = (obj[a].value.url || "").replace(/\?[^/]*$/gi, '');
115-
}
116-
}
117-
obj_part.inWhatUrl = obj2[b];
118-
}
119-
120-
}
121-
122-
obj_part.uriDetected = obj[a];
123-
obj_final.push(obj_part);
124-
}
71+
let uriMatchList: IndexContainingBaseMatch[] = TextAreaService.extractCertainPureUris(textStr, uris, endBoundary);
72+
let urlMatchList: IndexContainingBaseMatch[] = TextAreaService.extractAllPureUrls(textStr);
12573

126-
return obj_final;
74+
return processAllUriMatches(uriMatchList, urlMatchList);
12775

12876
},
12977

src/api/XmlAreaApi.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ import {BasePatterns} from "../pattern/BasePatterns";
66
import {XmlAreaService} from "../service/XmlAreaService";
77
import {UrlAreaService} from "../service/UrlAreaService";
88

9+
/*
10+
* The XmlArea should be refactored, and text codes should be included.
11+
* */
912
export const XmlAreaApi = {
1013

1114

0 commit comments

Comments
 (0)