
Commit 32b714f

fix: Remove extractAllFuzzyUrls, as this sort of extraction is better handled by AI
1 parent 1e2c04b commit 32b714f

File tree

5 files changed: +4 −442 lines

README.md

Lines changed: 4 additions & 29 deletions
@@ -1,4 +1,4 @@
-# Url-knife [![Build Status](https://travis-ci.org/patternknife/url-knife.svg?branch=master)](https://travis-ci.org/patternknife/url-knife) [![NPM version](https://img.shields.io/npm/v/url-knife.svg)](https://www.npmjs.com/package/url-knife) [![](https://data.jsdelivr.com/v1/package/gh/patternknife/url-knife/badge)](https://www.jsdelivr.com/package/gh/patternknife/url-knife) [![](https://badgen.net/bundlephobia/minzip/url-knife)](https://bundlephobia.com/result?p=url-knife)
+# Url-knife [![NPM version](https://img.shields.io/npm/v/url-knife.svg)](https://www.npmjs.com/package/url-knife) [![](https://data.jsdelivr.com/v1/package/gh/patternknife/url-knife/badge)](https://www.jsdelivr.com/package/gh/patternknife/url-knife) [![](https://badgen.net/bundlephobia/minzip/url-knife)](https://bundlephobia.com/result?p=url-knife)
 ## Overview
 Extract and decompose (fuzzy) URLs (including emails, which are conceptually a part of URLs) in texts with robust patterns.
 
@@ -35,9 +35,7 @@ import Pattern from 'url-knife';
 
 [Chapter 3. Extract URIs with certain names](#chapter-3-extract-uris-with-certain-names)
 
-[Chapter 4. Extract all fuzzy URLs](#chapter-4-extract-all-fuzzy-urls) (False positives detected)
-
-[Chapter 5. Extract all URLs in raw HTML or XML](#chapter-5-extract-all-urls-in-raw-html-or-xml)
+[Chapter 4. Extract all URLs in raw HTML or XML](#chapter-4-extract-all-urls-in-raw-html-or-xml)
 
 
 #### Chapter 1. Normalize or parse one URL
@@ -450,31 +448,8 @@ var sampleText = 'https://google.com/abc/777?a=5&b=7 abc/def 333/kak abc/55에
 }
 ]
 ```
-
-#### Chapter 4. Extract all fuzzy URLs
-##### The strongest url extracting method of URL-knife in natural language texts. However, this does not detect intranets due to false positives. If you need to extract intranets, go back to the Chapter 2 above.
-
-``` javascript
-var textStr = '142 .42.1.1:8080 123.45 xtp://--[::1]:8000에서 h ttpp ;//-www.ex ample;com -/wpstyle/??p=3?6/4&x=5/3 in the ssh h::/;/ww.example.com/wpstyle/?p=364 is ok ' +
-'h ttp:/://132 .42.,1.1 HT TP:// foo, co,.kr/blah_blah_(wikipedia) https://www.google .org :8005/maps/place/USA/@36.2218457,... tnae1ver.co. jp;8000on the internet Asterisk\n ' +
-'the packed1book.net. 가나다@apacbook.ac.kr fakeshouldnotbedetected.url?abc=fake s5houl7十七日dbedetected.jp?japan=go&html=<span>가나다@pacbook.travelersinsurance</span>;' +
-' abc,com//ad/fg/?kk=5 abc@daum.net Have you visited http://agoasidaio.ac.kr?abd=55...,.&kkk=5rk.,, ' +
-'Have <b>you</b> visited goasidaio.ac.kr?abd=5hell0?5...&kkk=5rk.,. ';
-
-/**
- * @brief
- * Distill all urls including fuzzy matched ones from normal text
- * @author Andrew Kang
- * @param textStr string required
- * @return array
- */
-var urls = Pattern.TextArea.extractAllFuzzyUrls(textStr)
-```
-###### console.log()
-<a href="https://jsfiddle.net/AndrewKang/p0tc4ovb/" target="_blank">LIVE DEMO</a>
-
 
-#### Chapter 5. Extract all URLs in raw HTML or XML
+#### Chapter 4. Extract all URLs in raw HTML or XML
 
 ``` javascript
 // The sample of 'XML (HTML)'
@@ -538,4 +513,4 @@ var urls = PatternExtractor.XmlArea.extractAllUrls(xmlStr);
 ]
 ```
 
-Please inform me of more sophisticated patterns you need by leaving issues on Github or emailing me at studypurpose@naver.com.
+Please inform me of more sophisticated patterns you need by leaving issues or emailing me at studypurpose@naver.com.
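For reference, callers migrating off the removed Chapter 4 API can fall back to the library's stricter extractors. A minimal migration sketch: `XmlArea.extractAllUrls` appears verbatim in the retained Chapter 4 hunk above (there aliased as `PatternExtractor`), while the `TextArea.extractAllUrls` method name is an assumption based on the README's remaining chapter list, not something this diff confirms.

``` javascript
import Pattern from 'url-knife';

// Plain-text extraction. The method name `extractAllUrls` is assumed
// from the README's remaining chapters; it is not shown in this diff.
var textUrls = Pattern.TextArea.extractAllUrls(
    'Have you visited http://agoasidaio.ac.kr?abd=55&kkk=5 today?');

// Markup extraction. This entry point appears verbatim in the retained
// Chapter 4 example above.
var xmlUrls = Pattern.XmlArea.extractAllUrls(
    '<a href="https://www.google.com">google</a>');

console.log(textUrls, xmlUrls);
```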

src/controller.js

Lines changed: 0 additions & 19 deletions
@@ -1,30 +1,12 @@
-import Util from './util';
 import Pattern from './pattern';
 import Service from './service';
 
-import Valid from './valid';
-
 /*
  * All Public
  * */
 
 const TextArea = {
 
-    /**
-     * @brief
-     * Distill all urls including fuzzy matched ones from normal text
-     * @author Andrew Kang
-     * @param textStr string required
-     * @return array
-     */
-    extractAllFuzzyUrls(textStr) {
-        //Pattern.Children.setUrlPattern(noProtocolJsn);
-
-        //console.log('a : ' + Pattern.Children.url);
-
-        return Service.Text.extractAllFuzzyUrls(textStr);
-
-    },
 
     /**
      * @brief
@@ -156,7 +138,6 @@ const TextArea = {
 
 };
 
-
 const UrlArea = {
 
     /**
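The removed controller method was a thin public delegate into the service layer, so its disappearance is a breaking API change for anyone calling it. A hypothetical guard for downstream code; the fallback name `extractAllUrls` is an assumption taken from the README's remaining chapters, not confirmed by this diff.

``` javascript
import Pattern from 'url-knife';

// extractAllFuzzyUrls is gone from TextArea after this commit.
// Feature-detect it and fall back to the stricter extractor; the
// fallback method name `extractAllUrls` is assumed, not confirmed here.
let extract = (Pattern.TextArea.extractAllFuzzyUrls
    || Pattern.TextArea.extractAllUrls).bind(Pattern.TextArea);

console.log(extract('Have you visited http://agoasidaio.ac.kr?abd=55 lately?'));
```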

src/pattern.js

Lines changed: 0 additions & 48 deletions
@@ -174,54 +174,6 @@ const Descendants = {
             ')';
     },
 
-    get fuzzy_root_domains() {
-
-        let alls = Ancestors.all_root_domains;
-        alls = alls.replace(/^\(\?:|\)$/, '');
-
-        let arrs = alls.split('|');
-
-        let whole_rx = '(?:';
-        for (let a = 0; a < arrs.length; a++) {
-
-            let full_rx = '(?:[0-9]|[\\n\\r\\t\\s]|' + Ancestors.all_keypad_meta_chars + '|';
-
-            let part_arrs = [];
-            let part_rx = '[';
-
-            let one = arrs[a];
-            for (let b = 0; b < one.length; b++) {
-
-                let cr = one.charAt(b);
-
-                part_rx += cr;
-                part_arrs.push(cr);
-
-            }
-
-            part_rx += ']';
-
-            full_rx += part_rx + '|)';
-
-            for (let c = 0; c < part_arrs.length; c++) {
-
-                if (c < part_arrs.length - 1) {
-                    whole_rx += part_arrs[c] + full_rx;
-                } else {
-                    whole_rx += part_arrs[c];
-                }
-            }
-
-            if (a < arrs.length - 1) {
-                whole_rx += '|';
-            }
-        }
-
-        //console.log('w : ' + whole_rx);
-
-        return whole_rx;
-
-    },
 
     fuzzy_domain_end:
         '(?:[\\n\\r\\t\\s]|' + Ancestors.all_keypad_meta_chars_without_delimiters + '){0,2}?' + Ancestors.end_punc_regarded_char +
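The deleted `fuzzy_root_domains` getter built, for every known root domain, a regex that tolerates at most one stray digit, whitespace character, keypad meta character, or domain letter between each pair of letters. A self-contained sketch of that construction, where the `metaChars` default is a simplified stand-in for `Ancestors.all_keypad_meta_chars`:

``` javascript
// Build a noise-tolerant alternation over known root domains, in the
// spirit of the removed getter. Between every two letters of a domain
// such as "com", allow one optional digit, whitespace, meta character,
// or stray domain letter, so noisy text like "c0o m" still matches.
function buildFuzzyDomainRegex(domains, metaChars = '[;,.\\-]') {
    return '(?:' + domains.map(function (domain) {
        let noise = '(?:[0-9]|[\\n\\r\\t\\s]|' + metaChars + '|[' + domain + ']|)';
        // Join the domain's letters with the optional noise class.
        return domain.split('').join(noise);
    }).join('|') + ')';
}

let rx = new RegExp(buildFuzzyDomainRegex(['com', 'net', 'org']), 'i');
console.log(rx.test('c0o m')); // true: "com" with a digit and a space interleaved
console.log(rx.test('xyz'));   // false: no known root domain is present
```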

src/service.js

Lines changed: 0 additions & 62 deletions
@@ -9,68 +9,6 @@ const queryString = require('query-string');
 * */
 const Text = {
 
-    extractAllFuzzyUrls(textStr) {
-
-        if (!(textStr && typeof textStr === 'string')) {
-            throw new Error('the variable textStr must be a string type and not be null.');
-        }
-
-        // To increase the accuracy of applying regexes...
-        textStr = textStr.replace(/[\n\r\t\s]{2,}/, ' ');
-
-        let obj = [];
-
-        let rx = new RegExp(Pattern.Children.fuzzy_url(), 'gi');
-
-        let matches = [];
-        let match = {};
-
-        while ((match = rx.exec(textStr)) !== null) {
-
-            /* SKIP DEPENDENCY */
-            /* if (/^@/.test(match[0])) {
-                continue;
-            }*/
-
-            /* this can affect indexes so commented */
-            //mod_val = mod_val.replace(/[\n\r\t\s]/g, '');
-
-            let st_idx = match.index;
-            let end_idx = match.index + match[0].length;
-
-            let mod_val = match[0];
-            let re = Url.normalizeUrl(mod_val);
-
-            /* SKIP DEPENDENCY */
-
-            // Decimals
-            if (new RegExp('^(?:\\.|[0-9]|' + Pattern.Ancestors.two_bytes_num + '|[\\n\\r\\t\\s])+$', 'i').test(re['url'])) {
-                // ip_v4 is OK
-                if (!new RegExp('^' + Pattern.Ancestors.ip_v4 + '$', 'i').test(re['onlyDomain'])) {
-                    continue;
-                }
-            }
-
-
-            /* this part doesn't need to be included */
-            /* if (re['removedTailOnUrl'] && re['removedTailOnUrl'].length > 0) {
-                end_idx -= re['removedTailOnUrl'].length;
-            }*/
-
-            obj.push({
-                'value': re,
-                'area': 'text'
-                /* 'index': {
-                    'start': st_idx,
-                    'end': end_idx
-                }*/
-            });
-        }
-
-        return obj;
-
-    },
-
     extractAllPureUrls(textStr) {
 
         //console.log('a : ' + Pattern.Children.url);
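The deleted service method followed a common shape: run a global regex with `exec` in a loop, normalize each match, and drop numeric-only matches unless they form a valid IPv4 address. A simplified, self-contained sketch of that loop; the URL pattern argument and the inline IPv4 check are stand-ins for `Pattern.Children.fuzzy_url()` and `Pattern.Ancestors.ip_v4`, and no normalization step is modeled here.

``` javascript
function extractWithNumericFilter(textStr, urlRegexSource) {
    if (!(textStr && typeof textStr === 'string')) {
        throw new Error('the variable textStr must be a string type and not be null.');
    }

    let rx = new RegExp(urlRegexSource, 'gi');
    let results = [];
    let match;

    while ((match = rx.exec(textStr)) !== null) {
        let candidate = match[0];

        // Drop matches made only of digits, dots, and whitespace
        // (e.g. "123.45") unless they form a full IPv4 address.
        let numericOnly = /^[0-9.\s]+$/.test(candidate);
        let looksIpV4 = /^(?:\d{1,3}\.){3}\d{1,3}$/.test(candidate);
        if (numericOnly && !looksIpV4) {
            continue;
        }

        results.push({ value: candidate, area: 'text' });
    }

    return results;
}

// "123.45" is filtered out; "example.com" survives.
console.log(extractWithNumericFilter(
    'visit example.com or 123.45', '[\\w.-]+\\.\\w{2,}'));
```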
