
Commit 32b714f

fix: Remove extractAllFuzzyUrls, as this sort of extraction is better handled by AI
1 parent 1e2c04b commit 32b714f

File tree

5 files changed: +4 −442 lines

README.md

Lines changed: 4 additions & 29 deletions
@@ -1,4 +1,4 @@
-# Url-knife [![Build Status](https://travis-ci.org/patternknife/url-knife.svg?branch=master)](https://travis-ci.org/patternknife/url-knife) [![NPM version](https://img.shields.io/npm/v/url-knife.svg)](https://www.npmjs.com/package/url-knife) [![](https://data.jsdelivr.com/v1/package/gh/patternknife/url-knife/badge)](https://www.jsdelivr.com/package/gh/patternknife/url-knife) [![](https://badgen.net/bundlephobia/minzip/url-knife)](https://bundlephobia.com/result?p=url-knife)
+# Url-knife [![NPM version](https://img.shields.io/npm/v/url-knife.svg)](https://www.npmjs.com/package/url-knife) [![](https://data.jsdelivr.com/v1/package/gh/patternknife/url-knife/badge)](https://www.jsdelivr.com/package/gh/patternknife/url-knife) [![](https://badgen.net/bundlephobia/minzip/url-knife)](https://bundlephobia.com/result?p=url-knife)
 ## Overview
 Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with robust patterns.
 
@@ -35,9 +35,7 @@ import Pattern from 'url-knife';
 
 [Chapter 3. Extract URIs with certain names](#chapter-3-extract-uris-with-certain-names)
 
-[Chapter 4. Extract all fuzzy URLs](#chapter-4-extract-all-fuzzy-urls) (False positives detected)
-
-[Chapter 5. Extract all URLs in raw HTML or XML](#chapter-5-extract-all-urls-in-raw-html-or-xml)
+[Chapter 4. Extract all URLs in raw HTML or XML](#chapter-4-extract-all-urls-in-raw-html-or-xml)
 
 
 #### Chapter 1. Normalize or parse one URL
@@ -450,31 +448,8 @@ var sampleText = 'https://google.com/abc/777?a=5&b=7 abc/def 333/kak abc/55에
 }
 ]
 ```
-
-#### Chapter 4. Extract all fuzzy URLs
-##### The strongest url extracting method of URL-knife in natural language texts. However, this does not detect intranets due to false positives. If you need to extract intranets, go back to the Chapter 2 above.
-
-``` javascript
-var textStr = '142 .42.1.1:8080 123.45 xtp://--[::1]:8000에서 h ttpp ;//-www.ex ample;com -/wpstyle/??p=3?6/4&x=5/3 in the ssh h::/;/ww.example.com/wpstyle/?p=364 is ok ' +
-'h ttp:/://132 .42.,1.1 HT TP:// foo, co,.kr/blah_blah_(wikipedia) https://www.google .org :8005/maps/place/USA/@36.2218457,... tnae1ver.co. jp;8000on the internet Asterisk\n ' +
-'the packed1book.net. 가나다@apacbook.ac.kr fakeshouldnotbedetected.url?abc=fake s5houl7十七日dbedetected.jp?japan=go&html=<span>가나다@pacbook.travelersinsurance</span>;' +
-' abc,com//ad/fg/?kk=5 abc@daum.net Have you visited http://agoasidaio.ac.kr?abd=55...,.&kkk=5rk.,, ' +
-'Have <b>you</b> visited goasidaio.ac.kr?abd=5hell0?5...&kkk=5rk.,. ';
-
-/**
- * @brief
- * Distill all urls including fuzzy matched ones from normal text
- * @author Andrew Kang
- * @param textStr string required
- * @return array
- */
-var urls = Pattern.TextArea.extractAllFuzzyUrls(textStr)
-```
-###### console.log()
-<a href="https://jsfiddle.net/AndrewKang/p0tc4ovb/" target="_blank">LIVE DEMO</a>
-
 
-#### Chapter 5. Extract all URLs in raw HTML or XML
+#### Chapter 4. Extract all URLs in raw HTML or XML
 
 ``` javascript
 // The sample of 'XML (HTML)'
@@ -538,4 +513,4 @@ var urls = PatternExtractor.XmlArea.extractAllUrls(xmlStr);
 ]
 ```
 
-Please inform me of more sophisticated patterns you need by leaving issues on Github or emailing me at studypurpose@naver.com.
+Please inform me of more sophisticated patterns you need by leaving issues or emailing me at studypurpose@naver.com.
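For reference, callers migrating off the removed Chapter 4 API can fall back to the library's stricter extractors. A minimal migration sketch: `XmlArea.extractAllUrls` appears verbatim in the retained Chapter 4 hunk above (there aliased as `PatternExtractor`), while the `TextArea.extractAllUrls` method name is an assumption based on the README's remaining chapter list, not something this diff confirms.

``` javascript
import Pattern from 'url-knife';

// Plain-text extraction. The method name `extractAllUrls` is assumed
// from the README's remaining chapters; it is not shown in this diff.
var textUrls = Pattern.TextArea.extractAllUrls(
    'Have you visited http://agoasidaio.ac.kr?abd=55&kkk=5 today?');

// Markup extraction. This entry point appears verbatim in the retained
// Chapter 4 example above.
var xmlUrls = Pattern.XmlArea.extractAllUrls(
    '<a href="https://www.google.com">google</a>');

console.log(textUrls, xmlUrls);
```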

src/controller.js

Lines changed: 0 additions & 19 deletions
@@ -1,30 +1,12 @@
-import Util from './util';
 import Pattern from './pattern';
 import Service from './service';
 
-import Valid from './valid';
-
 /*
  * All Public
  * */
 
 const TextArea = {
 
-    /**
-     * @brief
-     * Distill all urls including fuzzy matched ones from normal text
-     * @author Andrew Kang
-     * @param textStr string required
-     * @return array
-     */
-    extractAllFuzzyUrls(textStr) {
-        //Pattern.Children.setUrlPattern(noProtocolJsn);
-
-        //console.log('a : ' + Pattern.Children.url);
-
-        return Service.Text.extractAllFuzzyUrls(textStr);
-
-    },
 
     /**
      * @brief
@@ -156,7 +138,6 @@ const TextArea = {
 
 };
 
-
 const UrlArea = {
 
     /**
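The removed controller method was a thin public delegate into the service layer, so its disappearance is a breaking API change for anyone calling it. A hypothetical guard for downstream code; the fallback name `extractAllUrls` is an assumption taken from the README's remaining chapters, not confirmed by this diff.

``` javascript
import Pattern from 'url-knife';

// extractAllFuzzyUrls is gone from TextArea after this commit.
// Feature-detect it and fall back to the stricter extractor; the
// fallback method name `extractAllUrls` is assumed, not confirmed here.
let extract = (Pattern.TextArea.extractAllFuzzyUrls
    || Pattern.TextArea.extractAllUrls).bind(Pattern.TextArea);

console.log(extract('Have you visited http://agoasidaio.ac.kr?abd=55 lately?'));
```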

src/pattern.js

Lines changed: 0 additions & 48 deletions
@@ -174,54 +174,6 @@ const Descendants = {
             ')';
     },
 
-    get fuzzy_root_domains() {
-
-        let alls = Ancestors.all_root_domains;
-        alls = alls.replace(/^\(\?:|\)$/, '');
-
-        let arrs = alls.split('|');
-
-        let whole_rx = '(?:';
-        for (let a = 0; a < arrs.length; a++) {
-
-            let full_rx = '(?:[0-9]|[\\n\\r\\t\\s]|' + Ancestors.all_keypad_meta_chars + '|';
-
-            let part_arrs = [];
-            let part_rx = '[';
-
-            let one = arrs[a];
-            for (let b = 0; b < one.length; b++) {
-
-                let cr = one.charAt(b);
-
-                part_rx += cr;
-                part_arrs.push(cr);
-
-            }
-
-            part_rx += ']';
-
-            full_rx += part_rx + '|)';
-
-            for (let c = 0; c < part_arrs.length; c++) {
-
-                if (c < part_arrs.length - 1) {
-                    whole_rx += part_arrs[c] + full_rx;
-                } else {
-                    whole_rx += part_arrs[c];
-                }
-            }
-
-            if (a < arrs.length - 1) {
-                whole_rx += '|';
-            }
-        }
-
-        //console.log('w : ' + whole_rx);
-
-        return whole_rx;
-
-    },
 
     fuzzy_domain_end:
         '(?:[\\n\\r\\t\\s]|' + Ancestors.all_keypad_meta_chars_without_delimiters + '){0,2}?' + Ancestors.end_punc_regarded_char +
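The deleted `fuzzy_root_domains` getter built, for every known root domain, a regex that tolerates at most one stray digit, whitespace character, keypad meta character, or domain letter between each pair of letters. A self-contained sketch of that construction, where the `metaChars` default is a simplified stand-in for `Ancestors.all_keypad_meta_chars`:

``` javascript
// Build a noise-tolerant alternation over known root domains, in the
// spirit of the removed getter. Between every two letters of a domain
// such as "com", allow one optional digit, whitespace, meta character,
// or stray domain letter, so noisy text like "c0o m" still matches.
function buildFuzzyDomainRegex(domains, metaChars = '[;,.\\-]') {
    return '(?:' + domains.map(function (domain) {
        let noise = '(?:[0-9]|[\\n\\r\\t\\s]|' + metaChars + '|[' + domain + ']|)';
        // Join the domain's letters with the optional noise class.
        return domain.split('').join(noise);
    }).join('|') + ')';
}

let rx = new RegExp(buildFuzzyDomainRegex(['com', 'net', 'org']), 'i');
console.log(rx.test('c0o m')); // true: "com" with a digit and a space interleaved
console.log(rx.test('xyz'));   // false: no known root domain is present
```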

src/service.js

Lines changed: 0 additions & 62 deletions
@@ -9,68 +9,6 @@ const queryString = require('query-string');
 * */
 const Text = {
 
-    extractAllFuzzyUrls(textStr) {
-
-        if (!(textStr && typeof textStr === 'string')) {
-            throw new Error('the variable textStr must be a string type and not be null.');
-        }
-
-        // To increase the accuracy of applying regexes...
-        textStr = textStr.replace(/[\n\r\t\s]{2,}/, ' ');
-
-        let obj = [];
-
-        let rx = new RegExp(Pattern.Children.fuzzy_url(), 'gi');
-
-        let matches = [];
-        let match = {};
-
-        while ((match = rx.exec(textStr)) !== null) {
-
-            /* SKIP DEPENDENCY */
-            /* if (/^@/.test(match[0])) {
-                continue;
-            }*/
-
-            /* this can affect indexes so commented */
-            //mod_val = mod_val.replace(/[\n\r\t\s]/g, '');
-
-            let st_idx = match.index;
-            let end_idx = match.index + match[0].length;
-
-            let mod_val = match[0];
-            let re = Url.normalizeUrl(mod_val);
-
-            /* SKIP DEPENDENCY */
-
-            // Decimals
-            if (new RegExp('^(?:\\.|[0-9]|' + Pattern.Ancestors.two_bytes_num + '|[\\n\\r\\t\\s])+$', 'i').test(re['url'])) {
-                // ip_v4 is OK
-                if (!new RegExp('^' + Pattern.Ancestors.ip_v4 + '$', 'i').test(re['onlyDomain'])) {
-                    continue;
-                }
-            }
-
-
-            /* this part doesn't need to be included */
-            /* if (re['removedTailOnUrl'] && re['removedTailOnUrl'].length > 0) {
-                end_idx -= re['removedTailOnUrl'].length;
-            }*/
-
-            obj.push({
-                'value': re,
-                'area': 'text'
-                /* 'index': {
-                    'start': st_idx,
-                    'end': end_idx
-                }*/
-            });
-        }
-
-        return obj;
-
-    },
-
     extractAllPureUrls(textStr) {
 
         //console.log('a : ' + Pattern.Children.url);
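The deleted service method followed a common shape: run a global regex with `exec` in a loop, normalize each match, and drop numeric-only matches unless they form a valid IPv4 address. A simplified, self-contained sketch of that loop; the URL pattern argument and the inline IPv4 check are stand-ins for `Pattern.Children.fuzzy_url()` and `Pattern.Ancestors.ip_v4`, and no normalization step is modeled here.

``` javascript
function extractWithNumericFilter(textStr, urlRegexSource) {
    if (!(textStr && typeof textStr === 'string')) {
        throw new Error('the variable textStr must be a string type and not be null.');
    }

    let rx = new RegExp(urlRegexSource, 'gi');
    let results = [];
    let match;

    while ((match = rx.exec(textStr)) !== null) {
        let candidate = match[0];

        // Drop matches made only of digits, dots, and whitespace
        // (e.g. "123.45") unless they form a full IPv4 address.
        let numericOnly = /^[0-9.\s]+$/.test(candidate);
        let looksIpV4 = /^(?:\d{1,3}\.){3}\d{1,3}$/.test(candidate);
        if (numericOnly && !looksIpV4) {
            continue;
        }

        results.push({ value: candidate, area: 'text' });
    }

    return results;
}

// "123.45" is filtered out; "example.com" survives.
console.log(extractWithNumericFilter(
    'visit example.com or 123.45', '[\\w.-]+\\.\\w{2,}'));
```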
