deshortify
Advanced tools
Comparing version 0.4.0 to 0.5.0
# v0.4.0 | ||
* Skip GDPR interstitials | ||
# v0.3.0 (2017-12-06) | ||
@@ -3,0 +7,0 @@ |
{ | ||
"name": "deshortify", | ||
"version": "0.4.0", | ||
"version": "0.5.0", | ||
"description": "Turns short URLs into long, meaningful, crap-less URLs.", | ||
@@ -17,8 +17,15 @@ "main": "dist/deshortify.js", | ||
"scripts": { | ||
"lint": "eslint src", | ||
"lintfix": "eslint src --fix", | ||
"lint": "prettier src/**.js", | ||
"lintfix": "prettier src/**.js --write", | ||
"build": "rollup -c rollup-config.js" | ||
}, | ||
"prettier": { | ||
"printWidth": 85, | ||
"tabWidth": 4, | ||
"useTabs": true, | ||
"trailingComma": "es5", | ||
"arrowParens": "always" | ||
}, | ||
"devDependencies": { | ||
"eslint": "^4.0.0", | ||
"prettier": "^1.17.1", | ||
"rollup": "^0.52.1", | ||
@@ -25,0 +32,0 @@ "rollup-plugin-buble": "^0.18.0" |
@@ -0,12 +1,10 @@ | ||
const http = require("http"); | ||
const https = require("https"); | ||
const parseUrl = require("url").parse; | ||
const resolveUrl = require("url").resolve; | ||
const formatUrl = require("url").format; | ||
const http = require('http'); | ||
const https = require('https'); | ||
const parseUrl = require('url').parse; | ||
const resolveUrl = require('url').resolve; | ||
const formatUrl = require('url').format; | ||
export default class Deshortifier { | ||
// module.exports = class Deshortifier { | ||
// module.exports = class Deshortifier { | ||
constructor(options = {}) { | ||
this._cache = {}; | ||
@@ -16,10 +14,16 @@ this._verbose = !!options.verbose; | ||
if (process) { | ||
let deshortifyVersion = require('../package.json').version; | ||
let nodejsVersion = process.release.name + '/' + process.version; | ||
let deshortifyVersion = require("../package.json").version; | ||
let nodejsVersion = process.release.name + "/" + process.version; | ||
// Nice user agent for most URLs | ||
this._userAgent = options.userAgent || | ||
('Deshortify/' + deshortifyVersion + ' ' + nodejsVersion + ' (+https://gitlab.com/IvanSanchez/deshortify)'); | ||
this._userAgent = | ||
options.userAgent || | ||
"Deshortify/" + | ||
deshortifyVersion + | ||
" " + | ||
nodejsVersion + | ||
" (+https://gitlab.com/IvanSanchez/deshortify)"; | ||
// User agent for asshole websites which filter requests based on it (I'm looking at you, facebook) | ||
this._assholeUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0.2 Safari/602.3.12"; | ||
this._assholeUserAgent = | ||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0.2 Safari/602.3.12"; | ||
} else { | ||
@@ -31,3 +35,2 @@ // User agent when running on browser is the same as the navigator's | ||
// Returns a Promise for the deshortified URL | ||
@@ -38,3 +41,3 @@ deshortify(url) { | ||
if (!parsedUrl.protocol) { | ||
url = 'http://' + url; | ||
url = "http://" + url; | ||
} | ||
@@ -46,9 +49,9 @@ | ||
_deshortify(url, breadcrumbs = []) { | ||
if (breadcrumbs.indexOf(url) !== -1 || // Circular loop, break it. | ||
if ( | ||
breadcrumbs.indexOf(url) !== -1 || // Circular loop, break it. | ||
breadcrumbs.length > 20 || // This looks like an infinite non-circular loop, break it. | ||
this._skipUrl(url)) // URL whitelisted from being deshortified because nuisances | ||
{ | ||
this._skipUrl(url) // URL whitelisted from being deshortified because nuisances | ||
) { | ||
if (this._verbose) { | ||
console.log('Skipping: ', url); | ||
console.log("Skipping: ", url); | ||
} | ||
@@ -61,3 +64,2 @@ return Promise.resolve(this._cleanUp(url)); | ||
if (url in this._cache) { | ||
let cachedUrl = this._cache[url]; | ||
@@ -67,3 +69,5 @@ | ||
// return the promise already created. | ||
if (cachedUrl instanceof Promise) { return cachedUrl; } | ||
if (cachedUrl instanceof Promise) { | ||
return cachedUrl; | ||
} | ||
@@ -75,3 +79,11 @@ if (cachedUrl === url) { | ||
if (this._verbose) { | ||
console.log('cached follow: ', url, ' → ', cachedUrl, ' (breadcrumbs lenght is ', breadcrumbs.length, ')'); | ||
console.log( | ||
"cached follow: ", | ||
url, | ||
" → ", | ||
cachedUrl, | ||
" (breadcrumbs lenght is ", | ||
breadcrumbs.length, | ||
")" | ||
); | ||
} | ||
@@ -81,6 +93,5 @@ return this._deshortify(cachedUrl, breadcrumbs); | ||
let parsedUrl = parseUrl(url); | ||
if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') { | ||
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") { | ||
// Neither HTTP or HTTPS, just return whatever (might be ftp, or something more esoteric as irc or gopher) | ||
@@ -91,41 +102,40 @@ return Promise.resolve(url); | ||
let userAgent = | ||
(parsedUrl.hostname === 'fb.me') ? | ||
this._assholeUserAgent : | ||
this._userAgent; | ||
parsedUrl.hostname === "fb.me" | ||
? this._assholeUserAgent | ||
: this._userAgent; | ||
// Handle header-based redirects | ||
return this._cache[url] = new Promise((resolve)=>{ | ||
return (this._cache[url] = new Promise((resolve) => { | ||
let backend = parsedUrl.protocol === "http:" ? http : https; | ||
let backend = | ||
parsedUrl.protocol === 'http:' ? | ||
http: | ||
https; | ||
let request = backend.request( | ||
{ | ||
method: "HEAD", | ||
protocol: parsedUrl.protocol, | ||
hostname: parsedUrl.hostname, | ||
port: parsedUrl.port, | ||
path: parsedUrl.path, | ||
headers: { | ||
"User-Agent": userAgent, | ||
}, | ||
}, | ||
(res) => { | ||
// console.log(res.headers); | ||
let request = backend.request({ | ||
method: 'HEAD', | ||
protocol: parsedUrl.protocol, | ||
hostname: parsedUrl.hostname, | ||
port: parsedUrl.port, | ||
path: parsedUrl.path, | ||
headers: { | ||
'User-Agent': userAgent | ||
} | ||
}, (res)=>{ | ||
// console.log(res.headers); | ||
if ("location" in res.headers) { | ||
// w00t! We've got a 30x response and a redirect! | ||
let newUrl = resolveUrl(url, res.headers.location); | ||
if (this._verbose && url !== newUrl) { | ||
console.log("follow: ", url, " → ", newUrl); | ||
} | ||
this._cache[url] = newUrl; | ||
return resolve(this._deshortify(newUrl, breadcrumbs)); | ||
} | ||
if ('location' in res.headers) { // w00t! We've got a 30x response and a redirect! | ||
let newUrl = resolveUrl(url, res.headers.location); | ||
if (this._verbose && (url !== newUrl)) { | ||
console.log('follow: ', url, ' → ', newUrl); | ||
} | ||
this._cache[url] = newUrl; | ||
return resolve(this._deshortify(newUrl, breadcrumbs)); | ||
// Giving up, looks like this was the final URL. | ||
return resolve(this._cleanUp(url)); | ||
} | ||
); | ||
// Giving up, looks like this was the final URL. | ||
return resolve(this._cleanUp(url)); | ||
}); | ||
request.on('error', ()=>{ | ||
request.on("error", () => { | ||
// Panic and return the original url. | ||
@@ -135,12 +145,8 @@ return resolve(this._cleanUp(url)); | ||
request.end(); // Actually send the request | ||
}); | ||
request.end(); // Actually send the request | ||
})); | ||
} | ||
// Cleans up spammy query parameters and hash bits. | ||
_cleanUp(url) { | ||
/// TODO | ||
let parsedUrl = parseUrl(url, true); | ||
@@ -153,21 +159,20 @@ let host = parsedUrl.host; | ||
if (params) { | ||
// console.log(url); | ||
params.forEach((name)=>{ | ||
// console.log(url); | ||
params.forEach((name) => { | ||
let val = parsedUrl.query[name]; | ||
// console.log(host, ' / ', name, ' = ', val); | ||
// console.log(host, ' / ', name, ' = ', val); | ||
// console.log('Matches _source: ', name.match(/_source$/)); | ||
// console.log('Matches _source: ', name.match(/_source$/)); | ||
if ( | ||
(typeof val !== 'string') || // e.g. http://www.businessinsider.com/...&r=US&IR=T&IR=T | ||
(name.match(/_source$/)) || | ||
(name.match(/_medium$/)) || | ||
(name.match(/_term$/)) || | ||
(name.match(/_content$/)) || | ||
(name.match(/_campaign$/)) || | ||
(name.match(/_mchannel/)) || | ||
(name.match(/_kwd$/)) || | ||
(name === 'utm_cid') || | ||
(name === "cm_mmc") || | ||
typeof val !== "string" || // e.g. http://www.businessinsider.com/...&r=US&IR=T&IR=T | ||
name.match(/_source$/) || | ||
name.match(/_medium$/) || | ||
name.match(/_term$/) || | ||
name.match(/_content$/) || | ||
name.match(/_campaign$/) || | ||
name.match(/_mchannel/) || | ||
name.match(/_kwd$/) || | ||
name === "utm_cid" || | ||
name === "cm_mmc" || | ||
(name === "tag" && val === "as.rss") || | ||
@@ -184,14 +189,14 @@ (name === "ref" && val === "rss") || | ||
(name === "spref" && val === "gr") || | ||
(val.match(/^twitter/)) || | ||
(val.match(/\.twitter$/)) || | ||
(val === "share_btn_tw") || | ||
val.match(/^twitter/) || | ||
val.match(/\.twitter$/) || | ||
val === "share_btn_tw" || | ||
(name === "platform" && val === "hootsuite") || | ||
(name === "mbid" && val === "social_retweet") || // New Yorker et al | ||
(name === "mbid" && val === "social_twitter") || // New Yorker et al | ||
(name === "mbid" && val === "social_retweet") || // New Yorker et al | ||
(name === "mbid" && val === "social_twitter") || // New Yorker et al | ||
(host === "www.youtube.com" && name === "feature") || | ||
(host === "www.nytimes.com" && name === "smid") || | ||
(host === "www.nytimes.com" && name === "seid") || | ||
(name === "awesm") || // Appears as a logger of awesm shortener, at least in storify | ||
(name === "CMP" && val === "twt_gu") || // Guardian.co.uk short links | ||
(name === "CMP" && val.match(/^soc_/)) || // Guardian.co.uk short links | ||
name === "awesm" || // Appears as a logger of awesm shortener, at least in storify | ||
(name === "CMP" && val === "twt_gu") || // Guardian.co.uk short links | ||
(name === "CMP" && val.match(/^soc_/)) || // Guardian.co.uk short links | ||
(name === "CMP" && val.match(/^Share_/)) || | ||
@@ -204,12 +209,14 @@ (name === "ex_cid" && val === "story-twitter") || | ||
(name === "soc_trk" && val === "tw") || | ||
(name === "hootPostID") || | ||
(name === "a" && val === "socialmedia") || // Meetup | ||
name === "hootPostID" || | ||
(name === "a" && val === "socialmedia") || // Meetup | ||
(host.match(/medium.com$/) && name === "source") || | ||
(host.match(/elpais.com$/) && name === "id_externo_rsoc") || | ||
(host.match(/washingtonpost.com$$/) && name === "postshare") || | ||
(host.match(/washingtonpost.com$$/) && name === "ss_tw-bottom") || | ||
(val === "rss-default") || | ||
(name === "__twitter_impression") || | ||
(name === 'src' && val === "syn")) | ||
{ | ||
(host.match(/washingtonpost.com$$/) && | ||
name === "ss_tw-bottom") || | ||
val === "rss-default" || | ||
name === "__twitter_impression" || | ||
(name === "src" && val === "syn") || | ||
name === "fbclid" | ||
) { | ||
// Noop | ||
@@ -223,3 +230,3 @@ return; | ||
// console.log('cleanup: ', JSON.stringify(parsedUrl.query), ' → ', JSON.stringify(cleanedParams) ); | ||
// console.log('cleanup: ', JSON.stringify(parsedUrl.query), ' → ', JSON.stringify(cleanedParams) ); | ||
@@ -232,6 +239,6 @@ // Replace query params and delete duplicated stuff | ||
// console.log(parsedUrl); | ||
// console.log(parsedUrl); | ||
var cleanedUrl = formatUrl(parsedUrl); | ||
if (this._verbose && (url !== cleanedUrl)) { | ||
console.log('cleanup: ', url, ' → ', cleanedUrl ); | ||
if (this._verbose && url !== cleanedUrl) { | ||
console.log("cleanup: ", url, " → ", cleanedUrl); | ||
} | ||
@@ -241,7 +248,5 @@ return cleanedUrl; | ||
// Returns boolean true if the passed URL should be skipped altogether, | ||
// false otherwise | ||
_skipUrl(url) { | ||
let parsedUrl = parseUrl(url, true); | ||
@@ -252,30 +257,24 @@ | ||
return ( | ||
host === "youtu.be" || // Does not add more info | ||
host === "spoti.fi" || // Does not add more info | ||
host === "4sq.com" || // Does not add more info | ||
host === "flic.kr" || // Does not add more info | ||
host === "untp.beer" || // Does not add more info | ||
host === "youtube.com" || // Does not add more info | ||
host === "www.elmundo.es" || // El Mundo newspaper will only timeout and waste time | ||
host === "www.economist.com" || // "You are banned from this site. Please contact via a different client configuration if you believe that this is a mistake." | ||
host === "pbs.twimg.com" || // Might trigger verbose errors if twitter is over capacity | ||
// host === "www.linkedin.com" || // Used to redirect to login | ||
host === "session.wikispaces.com" || // Infinite redirect loop with different URLs params each time | ||
host.match(/twitter\.com$/) || // Infinite redirect to login | ||
host.match(/blogspot\.[a-z]{2-3}$/) || // Will redirect to a nearby geolocated server | ||
host.match(/facebook\.com$/) || // Will redirect to fb.com/unsupportedbrowser due to user-agent | ||
host.match(/\.nytimes\.com$/) || // Infinite nocookies loop | ||
host.match(/^www\.amazon\.$/) || // 405 MethodNotAllowed | ||
url.match(/subscribe/) || // Potential paywall | ||
url.match(/nocookie/) || // Potential paywall/login | ||
url.match(/gdpr/) // Potential "GDPR consent" interstitial | ||
url.length > 400 || // Just too long, maybe a REPL | ||
host === "youtu.be" || // Does not add more info | ||
host === "spoti.fi" || // Does not add more info | ||
host === "4sq.com" || // Does not add more info | ||
host === "flic.kr" || // Does not add more info | ||
host === "untp.beer" || // Does not add more info | ||
host === "youtube.com" || // Does not add more info | ||
host === "www.elmundo.es" || // El Mundo newspaper will only timeout and waste time | ||
host === "www.economist.com" || // "You are banned from this site. Please contact via a different client configuration if you believe that this is a mistake." | ||
host === "pbs.twimg.com" || // Might trigger verbose errors if twitter is over capacity | ||
// host === "www.linkedin.com" || // Used to redirect to login | ||
host === "session.wikispaces.com" || // Infinite redirect loop with different URLs params each time | ||
host.match(/twitter\.com$/) || // Infinite redirect to login | ||
host.match(/blogspot\.[a-z]{2-3}$/) || // Will redirect to a nearby geolocated server | ||
host.match(/facebook\.com$/) || // Will redirect to fb.com/unsupportedbrowser due to user-agent | ||
host.match(/\.nytimes\.com$/) || // Infinite nocookies loop | ||
host.match(/^www\.amazon\.$/) || // 405 MethodNotAllowed | ||
url.match(/subscribe/) || // Potential paywall | ||
url.match(/nocookie/) || // Potential paywall/login | ||
url.match(/gdpr/) // Potential "GDPR consent" interstitial | ||
); | ||
} | ||
} | ||
Major refactor
Supply chain risk: This package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 2 instances in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 2 instances in 1 package
40097
499
5