Skip to content

Commit c81ec96

Browse files
refactor : TextArea
1 parent e3962f3 commit c81ec96

19 files changed

+252
-244
lines changed

dist/api/TextAreaApi.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ exports.TextAreaApi = {
2727
intranet: false
2828
}) {
2929
SafeConditionalUrlPatternBuilder_1.SafeConditionalUrlPatternBuilder.setUrlPattern(noProtocolJsn);
30-
return TextAreaService_1.TextAreaService.extractAllPureUrls(textStr);
30+
return TextAreaService_1.TextAreaService.extractAllUrlMatchList(textStr);
3131
},
3232
/**
3333
* @brief
@@ -38,7 +38,7 @@ exports.TextAreaApi = {
3838
* @return array
3939
*/
4040
extractAllEmails(textStr, prefixSanitizer = true) {
41-
return TextAreaService_1.TextAreaService.extractAllPureEmails(textStr, prefixSanitizer);
41+
return TextAreaService_1.TextAreaService.extractAllEmailMatchList(textStr, prefixSanitizer);
4242
},
4343
/**
4444
*
@@ -58,8 +58,8 @@ exports.TextAreaApi = {
5858
if (!(textStr && typeof textStr === 'string')) {
5959
throw new Error('the variable textStr must be a string type and not be null.');
6060
}
61-
let uriMatchList = TextAreaService_1.TextAreaService.extractCertainPureUris(textStr, uris, endBoundary);
62-
let urlMatchList = TextAreaService_1.TextAreaService.extractAllPureUrls(textStr);
61+
let uriMatchList = TextAreaService_1.TextAreaService.extractCertainUriMatchList(textStr, uris, endBoundary);
62+
let urlMatchList = TextAreaService_1.TextAreaService.extractAllUrlMatchList(textStr);
6363
return (0, UriMatchProcessor_1.processAllUriMatches)(uriMatchList, urlMatchList);
6464
},
6565
};

dist/bo/EmailMatchProcessor.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"use strict";
2+
Object.defineProperty(exports, "__esModule", { value: true });
3+
exports.sanitizeEmailPrefix = sanitizeEmailPrefix;
4+
/**
 * Strips a leading run of non-alphanumeric characters from a matched email.
 *
 * The run is only stripped when it is immediately followed by an ASCII
 * letter or digit inside the local part (the text before "@"). If the local
 * part contains no such "run + alphanumeric" prefix, the email is returned
 * untouched.
 *
 * @param {string} matchedEmailFront - The part of the email before the "@" symbol.
 * @param {string} matchedEmail - The full email match.
 * @returns {{sanitizedEmail: string, removedLength: number}} The email with the
 *   leading run removed, and the number of characters that were removed from
 *   its start (0 when nothing was removed).
 */
function sanitizeEmailPrefix(matchedEmailFront, matchedEmail) {
    // Group 1 is the run to drop; the trailing alphanumeric is the first
    // character that must be kept.
    const leadingRunRx = /^([^a-zA-Z0-9]+)[a-zA-Z0-9]/;
    const match = leadingRunRx.exec(matchedEmailFront);
    // Only strip when the full email really begins with what was matched in
    // the local part; otherwise removedLength would describe characters that
    // were never removed from matchedEmail.
    if (match === null || !matchedEmail.startsWith(match[0])) {
        return {
            sanitizedEmail: matchedEmail,
            removedLength: 0,
        };
    }
    const removedLength = match[1].length;
    return {
        sanitizedEmail: matchedEmail.slice(removedLength),
        removedLength: removedLength,
    };
}

dist/bo/UriMatchProcessor.js

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
"use strict";
22
Object.defineProperty(exports, "__esModule", { value: true });
3-
exports.processUriMatchInIndexRange = processUriMatchInIndexRange;
43
exports.processAllUriMatches = processAllUriMatches;
4+
exports.adjustUrisRx = adjustUrisRx;
55
const DomainPatterns_1 = require("../pattern/DomainPatterns");
6+
const BasePatterns_1 = require("../pattern/BasePatterns");
7+
const ParamsPatterns_1 = require("../pattern/ParamsPatterns");
68
function processUriMatchInIndexRange(uriMatch, urlMatchList) {
79
let obj_part = {
810
uriDetected: undefined,
@@ -34,3 +36,26 @@ function processUriMatchInIndexRange(uriMatch, urlMatchList) {
3436
/**
 * Runs processUriMatchInIndexRange on every URI match, passing the same
 * URL match list to each call.
 *
 * @param uriMatchList - The URI matches to process.
 * @param urlMatchList - The URL matches handed to each per-URI call.
 * @returns A new array holding one processed result per URI match.
 */
function processAllUriMatches(uriMatchList, urlMatchList) {
    const resolveAgainstUrls = (candidate) => {
        return processUriMatchInIndexRange(candidate, urlMatchList);
    };
    return uriMatchList.map(resolveAgainstUrls);
}
39+
/**
 * Wraps a URI alternation in the leading and trailing context patterns used
 * to locate those URIs inside free text.
 *
 * The leading context is the same in both modes. Only the trailing context
 * depends on endBoundary:
 *  - truthy: the URI must be followed by mandatory URL params, a whitespace
 *    character, or the end of the input;
 *  - falsy: the URI is followed by the optional URL params pattern.
 *
 * @param {string} urisRxStr - The base URI regex string (inserted inside a non-capturing group).
 * @param {boolean} endBoundary - Whether to apply the end boundary condition.
 * @returns {string} The adjusted URI regex string.
 */
function adjustUrisRx(urisRxStr, endBoundary) {
    const { twoBytesNum, langChar } = BasePatterns_1.BasePatterns;
    // A digit, full-width digit, or language character; used on both sides
    // of the slash-free segment in the leading context.
    const edgeChar = `(?:[0-9]|${twoBytesNum}|${langChar})`;
    // Built once so the two modes cannot drift apart.
    const leadingContext = `(?:\\/[^\\s]*\\/|${edgeChar}[^/\\s]*${edgeChar}\\/|\\/|\\b)`;
    const trailingContext = endBoundary
        ? `(?:${ParamsPatterns_1.ParamsPatterns.mandatoryUrlParams}|[\\s]|$)`
        : ParamsPatterns_1.ParamsPatterns.optionalUrlParams;
    return `${leadingContext}(?:${urisRxStr})${trailingContext}`;
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
33
return (mod && mod.__esModule) ? mod : { "default": mod };
44
};
55
Object.defineProperty(exports, "__esModule", { value: true });
6-
exports.Normalizer = void 0;
6+
exports.UrlNormalizer = void 0;
77
const util_1 = __importDefault(require("../util"));
88
const FuzzyPartialUrlPatterns_1 = require("../pattern/FuzzyPartialUrlPatterns");
99
const BasePatterns_1 = require("../pattern/BasePatterns");
1010
const ProtocolPatterns_1 = require("../pattern/ProtocolPatterns");
1111
const DomainPatterns_1 = require("../pattern/DomainPatterns");
12-
exports.Normalizer = {
12+
exports.UrlNormalizer = {
1313
modifiedUrl: null,
1414
extractAndNormalizeProtocolFromSpacesRemovedUrl() {
1515
if (this.modifiedUrl == undefined) {

dist/service/EmailAreaService.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ const util_1 = __importDefault(require("../util"));
99
const BasePatterns_1 = require("../pattern/BasePatterns");
1010
const DomainPatterns_1 = require("../pattern/DomainPatterns");
1111
exports.EmailAreaService = {
12-
assortEmail(email) {
12+
parseEmail(email) {
1313
let obj = {
1414
email: null,
1515
removedTailOnEmail: null,

dist/service/TextAreaService.js

Lines changed: 48 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -8,142 +8,105 @@ const SafeConditionalUrlPatternBuilder_1 = require("../pattern/SafeConditionalUr
88
const BasePatterns_1 = require("../pattern/BasePatterns");
99
const DomainPatterns_1 = require("../pattern/DomainPatterns");
1010
const util_1 = __importDefault(require("../util"));
11-
const ParamsPatterns_1 = require("../pattern/ParamsPatterns");
1211
const EmailPatternBuilder_1 = require("../pattern/EmailPatternBuilder");
1312
const UrlAreaService_1 = require("./UrlAreaService");
1413
const EmailAreaService_1 = require("./EmailAreaService");
14+
const UriMatchProcessor_1 = require("../bo/UriMatchProcessor");
15+
const EmailMatchProcessor_1 = require("../bo/EmailMatchProcessor");
1516
exports.TextAreaService = {
16-
extractAllPureUrls(textStr) {
17+
extractAllUrlMatchList(textStr) {
1718
if (!(textStr && typeof textStr === 'string')) {
1819
throw new Error('the variable textStr must be a string type and not be null.');
1920
}
20-
let obj = [];
21-
let rx = new RegExp(SafeConditionalUrlPatternBuilder_1.SafeConditionalUrlPatternBuilder.getUrl, 'gi');
22-
let matches = [];
21+
const urlRx = new RegExp(SafeConditionalUrlPatternBuilder_1.SafeConditionalUrlPatternBuilder.getUrl, 'gi');
22+
let urlMatchList = [];
2323
let match;
24-
while ((match = rx.exec(textStr)) !== null) {
25-
/* SKIP DEPENDENCY */
24+
while ((match = urlRx.exec(textStr)) !== null) {
25+
/* EXCLUDED FROM MATCH LIST : Email Type */
2626
if (/^@/.test(match[0])) {
2727
continue;
2828
}
2929
let startIdx = match.index;
3030
let endIdx = match.index + match[0].length;
31-
let modVal = match[0];
32-
let re = UrlAreaService_1.UrlAreaService.parseUrl(modVal);
33-
/* SKIP DEPENDENCY */
34-
if (re.onlyDomain && new RegExp('^(?:\\.|[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + ')+$', 'i').test(re.onlyDomain)) {
31+
const parsedUrl = UrlAreaService_1.UrlAreaService.parseUrl(match[0]);
32+
/* EXCLUDED FROM MATCH LIST */
33+
if (parsedUrl.onlyDomain && new RegExp('^(?:\\.|[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + ')+$', 'i').test(parsedUrl.onlyDomain)) {
3534
// ipV4 is OK
36-
if (!new RegExp('^' + DomainPatterns_1.DomainPatterns.ipV4 + '$', 'i').test(re.onlyDomain)) {
35+
if (!new RegExp('^' + DomainPatterns_1.DomainPatterns.ipV4 + '$', 'i').test(parsedUrl.onlyDomain)) {
3736
continue;
3837
}
3938
}
40-
/* this part doesn't need to be included */
41-
if (re.removedTailOnUrl && re.removedTailOnUrl.length > 0) {
42-
endIdx -= re.removedTailOnUrl.length;
39+
/* Adjust endIdx by the length of removedTailOnUrl, if it exists */
40+
if (parsedUrl.removedTailOnUrl && parsedUrl.removedTailOnUrl.length > 0) {
41+
endIdx -= parsedUrl.removedTailOnUrl.length;
4342
}
44-
obj.push({
45-
value: re,
43+
urlMatchList.push({
44+
value: parsedUrl,
4645
area: 'text',
4746
index: {
4847
start: startIdx,
4948
end: endIdx
5049
}
5150
});
5251
}
53-
return obj;
52+
return urlMatchList;
5453
},
5554
/*
5655
* [!!IMPORTANT] Should be refactored.
5756
* */
58-
extractCertainPureUris(textStr, uris, endBoundary) {
59-
let uriRx = util_1.default.Text.urisToOneRxStr(uris);
60-
if (!uriRx) {
57+
extractCertainUriMatchList(textStr, uris, endBoundary) {
58+
let urisRxStr = util_1.default.Text.urisToOneRxStr(uris);
59+
if (!urisRxStr) {
6160
throw new Error('the variable uris are not available');
6261
}
63-
if (endBoundary) {
64-
uriRx = '(?:\\/[^\\s]*\\/|' +
65-
'(?:[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')'
66-
+ '[^/\\s]*(?:[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')'
67-
+ '\\/|\\/|\\b)' +
68-
'(?:' + uriRx + ')' +
69-
'(?:' + ParamsPatterns_1.ParamsPatterns.mandatoryUrlParams + '|[\\s]|$)';
70-
}
71-
else {
72-
uriRx = '(?:\\/[^\\s]*\\/|' +
73-
'(?:[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')'
74-
+ '[^/\\s]*(?:[0-9]|' + BasePatterns_1.BasePatterns.twoBytesNum + '|' + BasePatterns_1.BasePatterns.langChar + ')'
75-
+ '\\/|\\/|\\b)' +
76-
'(?:' + uriRx + ')' + ParamsPatterns_1.ParamsPatterns.optionalUrlParams;
77-
}
78-
let obj = [];
79-
/* normal text area */
80-
let rx = new RegExp(uriRx, 'gi');
62+
urisRxStr = (0, UriMatchProcessor_1.adjustUrisRx)(urisRxStr, endBoundary);
63+
let uriMatchList = [];
64+
const uriRx = new RegExp(urisRxStr, 'gi');
8165
let match;
82-
while ((match = rx.exec(textStr)) !== null) {
83-
let mod_val = match[0];
84-
obj.push({
85-
value: UrlAreaService_1.UrlAreaService.parseUrl(mod_val),
66+
while ((match = uriRx.exec(textStr)) !== null) {
67+
uriMatchList.push({
68+
value: UrlAreaService_1.UrlAreaService.parseUrl(match[0]),
8669
area: 'text',
8770
index: {
8871
start: match.index,
8972
end: match.index + match[0].length
9073
}
9174
});
9275
}
93-
return obj;
76+
return uriMatchList;
9477
},
95-
extractAllPureEmails(textStr, finalPrefixSanitizer) {
78+
extractAllEmailMatchList(textStr, finalPrefixSanitizer) {
9679
if (!(textStr && typeof textStr === 'string')) {
9780
throw new Error('the variable textStr must be a string type and not be null.');
9881
}
99-
let obj = [];
100-
let rx = new RegExp(EmailPatternBuilder_1.EmailPatternBuilder.getEmail, 'gi');
82+
let emailMatchList = [];
83+
const emailRx = new RegExp(EmailPatternBuilder_1.EmailPatternBuilder.getEmail, 'gi');
10184
let match;
102-
while ((match = rx.exec(textStr)) !== null) {
103-
let mod_val = match[0];
104-
let mod_val_front = mod_val.split(/@/)[0];
105-
let st_idx = match.index;
106-
let end_idx = match.index + match[0].length;
107-
/* prefixSanitizer */
85+
while ((match = emailRx.exec(textStr)) !== null) {
86+
let matchedEmail = match[0];
87+
let matchedEmailFront = matchedEmail.split(/@/)[0];
88+
let startIdx = match.index;
89+
let endIdx = match.index + match[0].length;
10890
if (finalPrefixSanitizer) {
109-
// the 'border' is a en char that divides non-en and en areas.
110-
let border = '';
111-
let removedLength = 0;
112-
let rx_left_plus_border = new RegExp('^([^a-zA-Z0-9]+)([a-zA-Z0-9])', '');
113-
let is_mod_val_front_only_foreign_lang = true;
114-
let match2;
115-
if ((match2 = rx_left_plus_border.exec(mod_val_front)) !== null) {
116-
is_mod_val_front_only_foreign_lang = false;
117-
//console.log('match2:' + match2);
118-
if (match2[1]) {
119-
removedLength = match2[1].length;
120-
}
121-
if (match2[2]) {
122-
border = match2[2];
123-
}
124-
}
125-
if (is_mod_val_front_only_foreign_lang === false) {
126-
mod_val = mod_val.replace(rx_left_plus_border, '');
127-
mod_val = border + mod_val;
128-
}
129-
st_idx += removedLength;
91+
const { sanitizedEmail, removedLength } = (0, EmailMatchProcessor_1.sanitizeEmailPrefix)(matchedEmailFront, matchedEmail);
92+
matchedEmail = sanitizedEmail;
93+
startIdx += removedLength;
13094
}
131-
let re = EmailAreaService_1.EmailAreaService.assortEmail(mod_val);
132-
//console.log('re : ' + re);
133-
/* this part doesn't need to be included */
134-
if (re.removedTailOnEmail && re.removedTailOnEmail.length > 0) {
135-
end_idx -= re.removedTailOnEmail.length;
95+
let parsedEmail = EmailAreaService_1.EmailAreaService.parseEmail(matchedEmail);
96+
/* Adjust endIdx by the length of removedTailOnEmail, if it exists */
97+
if (parsedEmail.removedTailOnEmail && parsedEmail.removedTailOnEmail.length > 0) {
98+
endIdx -= parsedEmail.removedTailOnEmail.length;
13699
}
137-
obj.push({
138-
value: re,
100+
emailMatchList.push({
101+
value: parsedEmail,
139102
area: 'text',
140103
index: {
141-
start: st_idx,
142-
end: end_idx
104+
start: startIdx,
105+
end: endIdx
143106
},
144-
pass: EmailAreaService_1.EmailAreaService.strictTest(re.email)
107+
pass: EmailAreaService_1.EmailAreaService.strictTest(parsedEmail.email)
145108
});
146109
}
147-
return obj;
110+
return emailMatchList;
148111
}
149112
};

dist/service/UrlAreaService.js

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ const BasePatterns_1 = require("../pattern/BasePatterns");
1010
const ProtocolPatterns_1 = require("../pattern/ProtocolPatterns");
1111
const DomainPatterns_1 = require("../pattern/DomainPatterns");
1212
const SafeConditionalUrlPatternBuilder_1 = require("../pattern/SafeConditionalUrlPatternBuilder");
13-
const Normalizer_1 = require("../bo/Normalizer");
13+
const UrlNormalizer_1 = require("../bo/UrlNormalizer");
1414
const queryString = require('query-string');
1515
exports.UrlAreaService = {
1616
/**
@@ -37,21 +37,21 @@ exports.UrlAreaService = {
3737
try {
3838
url = valid_1.default.validateAndTrimString(url);
3939
/* Chapter 1. Normalizing process */
40-
Normalizer_1.Normalizer.modifiedUrl = util_1.default.Text.removeAllSpaces(url);
40+
UrlNormalizer_1.UrlNormalizer.modifiedUrl = util_1.default.Text.removeAllSpaces(url);
4141
// 1. full url
4242
obj.url = url;
4343
// 2. protocol
44-
obj.protocol = Normalizer_1.Normalizer.extractAndNormalizeProtocolFromSpacesRemovedUrl();
44+
obj.protocol = UrlNormalizer_1.UrlNormalizer.extractAndNormalizeProtocolFromSpacesRemovedUrl();
4545
// 3. Domain
46-
let domainWithType = Normalizer_1.Normalizer.extractAndNormalizeDomainFromProtocolRemovedUrl();
46+
let domainWithType = UrlNormalizer_1.UrlNormalizer.extractAndNormalizeDomainFromProtocolRemovedUrl();
4747
obj.type = domainWithType.type;
4848
obj.onlyDomain = domainWithType.domain;
4949
// 4. Port
50-
obj.port = Normalizer_1.Normalizer.extractAndNormalizePortFromDomainRemovedUrl();
50+
obj.port = UrlNormalizer_1.UrlNormalizer.extractAndNormalizePortFromDomainRemovedUrl();
5151
// 5. Finalize
52-
obj.normalizedUrl = Normalizer_1.Normalizer.finalizeNormalization(obj.protocol, obj.port, obj.onlyDomain);
52+
obj.normalizedUrl = UrlNormalizer_1.UrlNormalizer.finalizeNormalization(obj.protocol, obj.port, obj.onlyDomain);
5353
// 6. Params & URI
54-
let uriParams = Normalizer_1.Normalizer.extractAndNormalizeUriParamsFromPortRemovedUrl();
54+
let uriParams = UrlNormalizer_1.UrlNormalizer.extractAndNormalizeUriParamsFromPortRemovedUrl();
5555
obj.onlyUri = uriParams.uri;
5656
obj.onlyParams = uriParams.params;
5757
/* Chapter 2. Post normalizing process (same as the function 'parseUrl')*/
@@ -429,8 +429,6 @@ exports.UrlAreaService = {
429429
obj.onlyUri = obj.url.replace(/\?[^/]*$/gi, '');
430430
}
431431
}
432-
//obj.normalizedUrl = this.normalizeUrl(obj.url)['normalizedUrl'];
433-
//}
434432
}
435433
catch (e) {
436434
console.log(e);

dist/service/XmlAreaService.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ exports.XmlAreaService = {
88
let matches = [];
99
let match;
1010
while ((match = rx.exec(xmlStr)) !== null) {
11-
//console.log(match[0].split(/[\t\s]+|>/)[0]);
1211
matches.push({
1312
'value': match[0],
1413
'elementName': match[0].split(/[\t\s]+|>/)[0].replace(/^</, ''),

dist/url-knife.bundle.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "url-knife",
3-
"version": "4.1.3",
3+
"version": "4.1.5",
44
"description": "Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with robust patterns.",
55
"main": "src/entry.ts",
66
"scripts": {

0 commit comments

Comments
 (0)