🚀 Big News: Socket Acquires Coana to Bring Reachability Analysis to Every Appsec Team.Learn more →

Book a Demo Install Sign in

parse-urls

Package Overview

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

parse-urls

Parse (fuzzy) URLs with robust patterns

2.7.6

latest

Source

npm

Version published: 6 years ago

Weekly downloads: 3

Maintainers: 1

Weekly downloads

Created: 6 years ago

Source

Parse-urls

Overview

Parse (fuzzy) URLs with robust patterns.

URL extractor

LIVE DEMO

If you can't see any results on 'jsfiddle.net', change the 'Editor layout' like this.

how-to-see

Fuzzy URL extractor

LIVE DEMO

Installation

For ES5 users,

<html>
       <body>
       	<p id="content"></p>
       	<script src="../dist/parse-urls.bundle.js"></script>
       	<script type="text/javascript">
       
      
       	</script>
       </body>
</html>

For ES6 npm users, run 'npm install --save parse-urls' in the console, then import the package:

import Pattern from 'parse-urls';

Syntax & Usage

Chapter 1. Normalize or parse one URL

Chapter 2. Extract all URLs

Chapter 3. Extract URIs with certain names

Chapter 4. Extract all fuzzy URLs

Chapter 5. Highlight all URLs in texts

Chapter 6. Extract all URLs in raw HTML or XML

Chapter 1. Normalize or parse one URL

/**
* @brief
* Normalize a URL that may contain human typing errors (intranet URLs are not normalized).
*/
var sample1 = Pattern.UrlArea.normalizeUrl("htp/:/abcgermany.,def;:9094 #park//noon??abc=retry")
var sample2 = Pattern.UrlArea.normalizeUrl("'://abc.jppp:9091 /park/noon'")
var sample3 = Pattern.UrlArea.normalizeUrl("ss hd : /university,.acd. ;jpkp: 9091/adc??abc=.com")

console.log()

{
 "url": "htp/:/abcgermany.,def;:9094 #park//noon??abc=retry",
 "normalizedUrl": "http://abcgermany.de:9094#park/noon?abc=retry",
 "removedTailOnUrl": "",
 "protocol": "http",
 "onlyDomain": "abcgermany.de",
 "onlyParams": "?abc=retry",
 "onlyUri": "#park/noon",
 "onlyUriWithParams": "#park/noon?abc=retry",
 "onlyParamsJsn": {
   "abc": "retry"
 },
 "type": "domain",
 "port": "9094"
}
{
 "url": "'://abc.jppp:9091 /park/noon'",
 "normalizedUrl": "abc.jp:9091/park/noon",
 "removedTailOnUrl": "'",
 "protocol": null,
 "onlyDomain": "abc.jp",
 "onlyParams": null,
 "onlyUri": "/park/noon'",
 "onlyUriWithParams": "/park/noon'",
 "onlyParamsJsn": null,
 "type": "domain",
 "port": "9091"
}
{
 "url": "ss hd : /university,.acd. ;jpkp로 접속",
 "normalizedUrl": "ssh://university.ac.jp",
 "removedTailOnUrl": "",
 "protocol": "ssh",
 "onlyDomain": "university.ac.jp",
 "onlyParams": null,
 "onlyUri": null,
 "onlyUriWithParams": null,
 "onlyParamsJsn": null,
 "type": "domain",
 "port": null
}

/**
* @brief
* Parse an url with no potential human errors
*/
var url = Pattern.UrlArea.parseUrl("xtp://gooppalgo.com/park/tree/?abc=1")

console.log()

{
 "url": "xtp://gooppalgo.com/park/tree/?abc=1",
 "removedTailOnUrl": "",
 "protocol": "xtp (unknown protocol)",
 "onlyDomain": "gooppalgo.com",
 "onlyParams": "?abc=1",
 "onlyUri": "/park/tree/",
 "onlyUriWithParams": "/park/tree/?abc=1",
 "onlyParamsJsn": {
   "abc": "1"
 },
 "type": "domain",
 "port": null
}

Chapter 2. Extract all URLs

    var textStr = 'http://[::1]:8000에서 http ://www.example.com/wpstyle/?p=364 is ok \n' +
        'HTTP://foo.com/blah_blah_(wikipedia) https://www.google.com/maps/place/USA/@36.2218457,... tnae1ver.com:8000on the internet  Asterisk\n ' +
        'the packed1book.net. fakeshouldnotbedetected.url?abc=fake s5houl７十七日dbedetected.jp?japan=go&html=<span>가나다@pacbook.net</span>; abc.com/ad/fg/?kk=5 abc@daum.net' +
        'Have you visited http://goasidaio.ac.kr?abd=5안녕하세요?5...,.&kkk=5rk.,, ' +
        'http://✪df.ws/123\n' +
        'http://142.42.1.1:8080/\n' +
        'http://-.~_!$&\'()*+,;=:%40:80%2f::::::@example.com ' +
        'Have <b>you</b> visited goasidaio.ac.kr?abd=5hell0?5...&kkk=5rk.,. ';
 
     /**
      * @brief
      * Distill all urls from normal text
      * @author Andrew Kang
      * @param textStr string required
      * @param noProtocolJsn object
      *    default :  {
                 'ip_v4' : false,
                 'ip_v6' : false,
                 'localhost' : false,
                 'intranet' : false
             }
      */
 var urls = Pattern.TextArea.extractAllUrls(textStr, {
                    'ip_v4' : true,
                    'ip_v6' : false,
                    'localhost' : false,
                    'intranet' : true
})

console.log()

LIVE DEMO

Chapter 3. Extract URIs with certain names


var sampleText = 'https://google.com/abc/777?a=5&b=7 abc/def 333/kak abc/55에서 abc/53 abc/53a/ka /123a/abc/556/dd /abc/123?a=5&b=tkt /xyj/asff' +
               'a333/kak  nice/guy/ bad/or/nice/guy ssh://nice.guy.com/?a=dkdfl';
 
    /**
     * @brief
     * Distill uris with certain names from normal text
     * @author Andrew Kang
     * @param textStr string required
     * @param uris array required
     * for example, [['a','b'], ['c','d']]
     * If you use {number}, this means 'only number' ex) [['a','{number}'], ['c','d']]
     * @param endBoundary boolean (default : false)
     * @return array
     */ 
               
 var uris = Pattern.TextArea.extractCertainUris(sampleText,
  [['{number}', 'kak'], ['nice','guy'],['abc', '{number}']], true)
 
 // 'If endBoundary is set to false, more uris are detected.'
 // This detects all URIs containing '{number}/kak' or 'nice/guy' or 'abc/{number}'

console.log()

[
 {
   "uri_detected": {
     "value": {
       "url": "/abc/777?a=5&b=7",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": "",
       "onlyParams": "?a=5&b=7",
       "onlyUri": "/abc/777",
       "onlyUriWithParams": "/abc/777?a=5&b=7",
       "onlyParamsJsn": {
         "a": "5",
         "b": "7"
       },
       "type": "domain",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 18,
       "end": 34
     }
   },
   "in_what_url": {
     "value": {
       "url": "https://google.com/abc/777?a=5&b=7",
       "removedTailOnUrl": "",
       "protocol": "https",
       "onlyDomain": "google.com",
       "onlyParams": "?a=5&b=7",
       "onlyUri": "/abc/777",
       "onlyUriWithParams": "/abc/777?a=5&b=7",
       "onlyParamsJsn": {
         "a": "5",
         "b": "7"
       },
       "type": "domain",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 0,
       "end": 34
     }
   }
 },
 {
   "uri_detected": {
     "value": {
       "url": "333/kak",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "333/kak",
       "onlyUriWithParams": "333/kak",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 43,
       "end": 51
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "abc/53",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "abc/53",
       "onlyUriWithParams": "abc/53",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 60,
       "end": 67
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "abc/533/ka",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "abc/533/ka",
       "onlyUriWithParams": "abc/533/ka",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 67,
       "end": 77
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "/123a/abc/556/dd",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "/123a/abc/556/dd",
       "onlyUriWithParams": "/123a/abc/556/dd",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 89,
       "end": 105
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "/abc/123?a=5&b=tkt",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": "?a=5&b=tkt",
       "onlyUri": "/abc/123",
       "onlyUriWithParams": "/abc/123?a=5&b=tkt",
       "onlyParamsJsn": {
         "a": "5",
         "b": "tkt"
       },
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 106,
       "end": 124
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "nice/guy",
       "removedTailOnUrl": "/",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "nice/guy",
       "onlyUriWithParams": "nice/guy",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 144,
       "end": 153
     }
   },
   "in_what_url": null
 },
 {
   "uri_detected": {
     "value": {
       "url": "/or/nice/guy",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": null,
       "onlyParams": null,
       "onlyUri": "/or/nice/guy",
       "onlyUriWithParams": "/or/nice/guy",
       "onlyParamsJsn": null,
       "type": "uri",
       "port": null
     },
     "area": "text",
     "index": {
       "start": 157,
       "end": 170
     }
   },
   "in_what_url": null
 }
]

Chapter 4. Extract all fuzzy URLs

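Fuzzy extraction does not detect intranet URLs, because doing so produces too many false positives. If you need to extract intranet URLs, use extractAllUrls with the 'intranet' option enabled, as shown in Chapter 2 above.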

var textStr = '142 .42.1.1:8080 123.45 xtp://--[::1]:8000에서 h ttpp ;//-www.ex ample;com    -/wpstyle/??p=3?6/4&x=5/3 in the ssh h::/;/ww.example.com/wpstyle/?p=364 is ok ' +
         'h ttp:/://132 .42.,1.1 HT TP:// foo, co,.kr/blah_blah_(wikipedia) https://www.google .org :8005/maps/place/USA/@36.2218457,... tnae1ver.co. jp;8000on the internet  Asterisk\n ' +
         'the packed1book.net. 가나다@apacbook.ac.kr fakeshouldnotbedetected.url?abc=fake s5houl７十七日dbedetected.jp?japan=go&html=<span>가나다@pacbook.travelersinsurance</span>;' +
         ' abc,com//ad/fg/?kk=5 abc@daum.net Have you visited http://agoasidaio.ac.kr?abd=55...,.&kkk=5rk.,, ' +
         'Have <b>you</b> visited goasidaio.ac.kr?abd=5hell0?5...&kkk=5rk.,. ';
 
     /**
      * @brief
     * Distill all urls including fuzzy matched ones from normal text
      * @author Andrew Kang
      * @param textStr string required
      */

 var urls = Pattern.TextArea.extractAllFuzzyUrls(textStr)

console.log()

LIVE DEMO

Chapter 5. Highlight all URLs in texts


var sampleText = "If you visit "192.179.3.5?abc=2"..  
                   http://[::1]:8000.... "

    /**
     * @brief
     * Highlight all urls in the text by adding the given class name (clsName) to them
     * @author Andrew Kang
     * @param textStr string required
     * @param clsName string required
     * @param contentEditableMode boolean default false
     * @param noProtocolJsn object
     *    default :  {
                'ip_v4' : false,
                'ip_v6' : false,
                'localhost' : false,
                'intranet' : false
            }

     * @return string
     */
             
var textStr_new = Pattern.TextEditorArea.addClassToAllUrls(sampleText, 'highlighted1', false, {
                              'ip_v4' : true,
                              'ip_v6' : true,
                              'localhost' : true,
                              'intranet' : false
                          });

You can check how url patterns are highlighted by running the sample source below.

https://github.com/Andrew-Kang-G/parse-urls/blob/master/public/index.html

or LIVE DEMO

Chapter 6. Extract all URLs in raw HTML or XML

    // The sample of 'XML (HTML)'
var xmlStr =
        'en.wikipedia.org/wiki/Wikipedia:About\n' +
        '<body><p>packed1book.net?user[name][first]=tj&user[name][last]=holowaychuk</p>\n' +
        'fakeshouldnotbedetected.url?abc=fake -s5houl７十七日dbedetected.jp?japan=go- ' +
        'plus.google.co.kr0에서.., \n' +
        'https://plus.google.com/+google\n' +
        'https://www.google.com/maps/place/USA/@36.2218457,...' +
        '<img style=\' = > float : none ; height: 200px;max-width: 50%;margin-top : 3%\' alt="undefined" src="http://www.aaa가가.com/image/showWorkOrderImg?fileName=12345.png"/>\n' +
        '<!--how about adackedbooked.co.kr-the site?  请发邮件给我abc件给@navered.com ssh://www.aaa가.com" <p >--邮件给aa件给@daum.net</p> www.naver.com\n  <p style="width: 100%"></p>-->  "abc@daum.net"로 보내주세요. ' +
        '-gigi.dau.ac.kr?mac=10 -dau.ac.kr?mac=10 <p id="abc" class="def xxx gh" style="<>">abcd@daum.co.kr에서 가나다@pacbook.net<span style="color: rgb(127,127,127);">Please align the paper to the left.</span>&nbsp;</p>\n' +
        '<p> 구루.com <img style="float:none;height: 200px;margin-top : 3%" src="/image/showWorkOrderImg?fileName=123456.png" alt="undefined" abc/></p>\n' +
        'http: //ne1ver.com:8000?abc=1&dd=5 localhost:80 estonia.ee/ estonia.ee? <p class="https://www.aadc给s.cn"> 	https://flaviocopes.com/how-to-inspect-javascript-object/ ※Please ask 203.35.33.555:8000 if you have any issues! ※&nbsp;&nbsp;&nbsp;&nbsp;</p></body> Have you visited goasidaioaaa.ac.kr';
        
var urls = PatternExtractor.XmlArea.extractAllUrls(xmlStr);

console.log()

 [
// Not all listed
     {
       "value": {
         "url": "packed1book.net?user[name][first]=tj&user[name][last]=holowaychuk",
         "removedTailOnUrl": "",
         "protocol": null,
         "onlyDomain": "packed1book.net",
         "onlyParams": "?user[name][first]=tj&user[name][last]=holowaychuk",
         "onlyUri": null,
         "onlyUriWithParams": "?user[name][first]=tj&user[name][last]=holowaychuk",
         "onlyParamsJsn": {
           "user": {
             "name": {
               "first": "tj",
               "last": "holowaychuk"
             }
           }
         },
         "type": "domain",
         "port": null
       },
       "area": "text"
   },
   {
     "value": {
       "url": "adackedbooked.co.kr",
       "removedTailOnUrl": "",
       "protocol": null,
       "onlyDomain": "adackedbooked.co.kr",
       "onlyParams": null,
       "onlyUri": null,
       "onlyUriWithParams": null,
       "onlyParamsJsn": null,
       "type": "domain",
       "port": null
     },
     "area": "comment"
   }
    .....
 ]

Please inform me of more sophisticated patterns you need by leaving issues on Github or emailing me at studypurpose@naver.com.

Keywords

FAQs

What is parse-urls?

Is parse-urls popular?

Is parse-urls well maintained?

Package last updated on 05 Oct 2019

Did you know?

Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.

Install

parse-urls

Overview

URL extractor

If you can't see any results on 'jsfiddle.net', change the 'Editor layout' like this.

Fuzzy URL extractor

Installation

Syntax & Usage

Chapter 1. Normalize or parse one URL

console.log()

console.log()

Chapter 2. Extract all URLs

console.log()

Chapter 3. Extract URIs with certain names

console.log()

Chapter 4. Extract all fuzzy URLs

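Fuzzy extraction does not detect intranet URLs, because doing so produces too many false positives. If you need to extract intranet URLs, use extractAllUrls with the 'intranet' option enabled, as shown in Chapter 2 above.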

console.log()

Chapter 5. Highlight all URLs in texts

Chapter 6. Extract all URLs in raw HTML or XML

console.log()

Keywords

Related posts

8 More Malicious Firefox Extensions: Exploiting Popular Game Recognition, Hijacking User Sessions, and Stealing OAuth Credentials

Official Go SDK for MCP in Development, Stable Release Expected in August