confidencial-ni-node
Advanced tools
Comparing version 0.0.9 to 0.0.10
@@ -8,3 +8,5 @@ // Load modules | ||
var async = require('async'); | ||
var jquery = fs.readFileSync(require('path').resolve(__dirname, 'jquery.min.js')).toString(); | ||
var jquery = fs.readFileSync( | ||
require('path').resolve(__dirname, 'jquery.min.js') | ||
).toString(); | ||
@@ -31,196 +33,180 @@ // confidencial categories | ||
// article scraping data | ||
var parseArticleOptions = { | ||
domain: "http://www.confidencial.com.ni/", | ||
domain: 'http://www.confidencial.com.ni/', | ||
elements: [ | ||
{ | ||
name: 'title', | ||
sel: function ($) { | ||
var result = $('#articleheader h2').text().trim(); | ||
return ( !S(result).isEmpty() ) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'title_sub', | ||
sel: function($) { | ||
var result = $('#articleheader h3').text().trim(); | ||
return ( !S(result).isEmpty() ) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'title_paragraph', | ||
sel: function($) { | ||
var result = $('#articleheader p.bold').text().trim(); | ||
return ( !S(result).isEmpty() ) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'author', | ||
sel: function($) { | ||
var result = $('#articleheader p.authorname').text().trim().split("|")[0].trim(); | ||
return ( !S(result).isEmpty() ) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'date', | ||
sel: function($) { | ||
var result = $('#articleheader p.authorname').text(); | ||
if( !S(result).isEmpty() ){ | ||
result = result.match( /(\d{1,2}\/\d{1,2}\/\d{4})/g ); | ||
} | ||
{ | ||
name: 'title', | ||
sel: function ($) { | ||
var result = $('#articleheader h2').text().trim(); | ||
return (!S(result).isEmpty()) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'title_sub', | ||
sel: function($) { | ||
var result = $('#articleheader h3').text().trim(); | ||
return (!S(result).isEmpty()) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'title_paragraph', | ||
sel: function($) { | ||
var result = $('#articleheader p.bold').text().trim(); | ||
return (!S(result).isEmpty()) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'author', | ||
sel: function($) { | ||
var result = $('#articleheader p.authorname').text(). | ||
trim().split('|')[0].trim(); | ||
return (!S(result).isEmpty()) ? result : ''; | ||
} | ||
}, | ||
{ | ||
name: 'date', | ||
sel: function($) { | ||
var result = $('#articleheader p.authorname').text(); | ||
if (!S(result).isEmpty()) { | ||
result = result.match(/(\d{1,2}\/\d{1,2}\/\d{4})/g); | ||
} | ||
return (!S(result).isEmpty()) ? S(result).left(9).s : ''; | ||
} | ||
}, | ||
{ | ||
name: 'images', | ||
sel: function($) { | ||
var result = []; | ||
var array = $('article img').map(function() { | ||
return $(this).attr('src'); | ||
}).get(); | ||
array.forEach(function(item) { | ||
result.push(parseArticleOptions.domain + item); | ||
}); | ||
return (!S(result).isEmpty()) ? result : []; | ||
} | ||
}, | ||
{ | ||
name: 'category', | ||
sel: function($) { | ||
var result = $('#quicknav').text(); | ||
if (!S(result).isEmpty()) { | ||
result = S(result).trim().s; | ||
result = S(result).replaceAll('Confidencial', '').s; | ||
result = S(result).replaceAll('Leer artículo', '').s; | ||
result = S(result).replaceAll('»', '').s; | ||
result = S(result).trim().s; | ||
} | ||
return (!S(result).isEmpty()) ? S(result).trim().s : ''; | ||
} | ||
}, | ||
{ | ||
name: 'content', | ||
sel: function($) { | ||
var newResult = ''; | ||
var result = $('article div.content_article div.text_article').html(); | ||
if (!S(result).isEmpty()) { | ||
result = result.trim(); | ||
newResult = S(result).stripTags('p,br,strong').s; | ||
newResult = S(newResult).collapseWhitespace().s; | ||
newResult = S(newResult).replaceAll('<br />', '\n').s; | ||
newResult = S(newResult).replaceAll('<br /> <br />', '\n').s; | ||
newResult = S(newResult).replaceAll('<br /><br />', '\n').s; | ||
newResult = S(newResult).replaceAll('<p>', '').s; | ||
newResult = S(newResult).replaceAll('</p>', '\n').s; | ||
newResult = S(newResult).replaceAll(' ', '').s; | ||
newResult = S(newResult.replace(/<a.*href="(.*?)".*>(.*?)<\/a>/gi, '$2 $1')).trim().s; | ||
newResult = newResult.replace(/<(?:.|\n)*?>/gm, ''); | ||
if (S(newResult).endsWith('\n')) { | ||
return ( !S(result).isEmpty() ) ? S(result).left(9).s : ''; | ||
} | ||
}, | ||
{ | ||
name: 'images', | ||
sel: function($) { | ||
var result = []; | ||
var array = $("article img").map(function() { | ||
return $(this).attr("src"); | ||
}).get(); | ||
array.forEach(function(item){ | ||
result.push( parseArticleOptions.domain + item ); | ||
}); | ||
return ( !S(result).isEmpty() ) ? result : []; | ||
} | ||
}, | ||
{ | ||
name: 'category', | ||
sel: function($) { | ||
var result = $("#quicknav").text(); | ||
if( !S(result).isEmpty() ){ | ||
result = S(result).trim().s; | ||
result = S(result).replaceAll('Confidencial', '').s; | ||
result = S(result).replaceAll('Leer artículo', '').s; | ||
result = S(result).replaceAll('»', '').s; | ||
result = S(result).trim().s; | ||
} | ||
return ( !S(result).isEmpty() ) ? S(result).trim().s : ''; | ||
} | ||
}, | ||
{ | ||
name: 'content', | ||
sel: function($) { | ||
var new_result = ""; | ||
var result = $("article div.content_article div.text_article").html(); | ||
if( !S(result).isEmpty() ){ | ||
result = result.trim(); | ||
new_result = S( result ).stripTags('p,br,strong').s; | ||
new_result = S( new_result ).collapseWhitespace().s; | ||
new_result = S( new_result ).replaceAll('<br />', '\n').s; | ||
new_result = S( new_result ).replaceAll('<br /> <br />', '\n').s; | ||
new_result = S( new_result ).replaceAll('<br /><br />', '\n').s; | ||
new_result = S( new_result ).replaceAll('<p>', '').s; | ||
new_result = S( new_result ).replaceAll('</p>', '\n').s; | ||
new_result = S( new_result ).replaceAll(' ', '').s; | ||
new_result = S( new_result.replace(/<a.*href="(.*?)".*>(.*?)<\/a>/gi, "$2 $1") ).trim().s; | ||
new_result = new_result.replace(/<(?:.|\n)*?>/gm, ''); | ||
if( S( new_result).endsWith('\n') ){ | ||
} | ||
} | ||
return ( !S(result).isEmpty() ) ? S(new_result).trim().s : ''; | ||
} | ||
} | ||
return (!S(result).isEmpty()) ? S(newResult).trim().s : ''; | ||
} | ||
} | ||
] | ||
}; | ||
}; | ||
// simple function to compare if exist item in array | ||
function inArray(needle, haystack) { | ||
var length = haystack.length; | ||
for(var i = 0; i < length; i++) { | ||
if(haystack[i] == needle) return true; | ||
var length = haystack.length; | ||
for (var i = 0; i < length; i++) { | ||
if (haystack[i] == needle) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
return false; | ||
} | ||
// function to parse data | ||
function parse(site, callback) { | ||
jsdom.env({ | ||
url: site, | ||
scripts: ["http://code.jquery.com/jquery.js"], | ||
done: function (errors, window) { | ||
var $ = window.$; | ||
callback($, errors); | ||
} | ||
}); | ||
jsdom.env({ | ||
url: site, | ||
scripts: ['http://code.jquery.com/jquery.js'], | ||
done: function (errors, window) { | ||
var $ = window.$; | ||
callback($, errors); | ||
} | ||
}); | ||
}; | ||
// Global function to get all links from all categories | ||
module.exports.getAllLinks = function(callback){ | ||
function out(){ | ||
var result = []; | ||
module.exports.getAllLinks = function(callback) { | ||
function out() { | ||
var result = []; | ||
var fetch = function(url,cb){ | ||
request(url, function(err,response,body){ | ||
if ( err ){ | ||
cb( err ); | ||
} else { | ||
cb( null, body ); // First param indicates error, null=> no error | ||
} | ||
}); | ||
var fetch = function(url, cb) { | ||
request(url, function(err, response, body) { | ||
if (err) { | ||
cb(err); | ||
} | ||
else { | ||
cb(null, body); // First param indicates error, null=> no error | ||
} | ||
}); | ||
} | ||
async.map(categories, fetch, function(err, results){ | ||
if ( err){ | ||
console.log(err); | ||
// either file1, file2 or file3 has raised an error, so you should not use results and handle the error | ||
} else { | ||
results.forEach(function(category){ | ||
$ = cheerio.load(category); | ||
var links = $('.article h3 a,article h2 a'); //use your CSS selector here | ||
$(links).each(function(i, link){ | ||
var article_url = 'http://confidencial.com.ni/' + $(link).attr('href'); | ||
if( !inArray( article_url, result ) ){ | ||
result.push( article_url ); | ||
} | ||
}); | ||
}); | ||
callback(result); | ||
async.map(categories, fetch, function(err, results) { | ||
if (err) { | ||
console.log(err); | ||
// either file1, file2 or file3 has raised an error, so you should not use results and handle the error | ||
} else { | ||
results.forEach(function(category) { | ||
$ = cheerio.load(category); | ||
var links = $('.article h3 a,article h2 a'); //use your CSS selector here | ||
$(links).each(function(i, link) { | ||
var articleUrl = 'http://confidencial.com.ni/' + | ||
$(link).attr('href'); | ||
if (!inArray(articleUrl, result)) { | ||
result.push(articleUrl); | ||
} | ||
}); | ||
}); | ||
} | ||
out(); | ||
callback(result); | ||
} | ||
}); | ||
} | ||
out(); | ||
} | ||
// global function to get data from article url | ||
module.exports.getArticle = function(site, cb){ | ||
module.exports.getArticle = function(site, cb) { | ||
function getId(url) { | ||
var id = url.split('/'); | ||
return id[4]; | ||
} | ||
function getId(url){ | ||
var id = url.split('/'); | ||
return id[4]; | ||
} | ||
function out(url) { | ||
parse(url, function ($, err) { | ||
var result = {}; | ||
result.id = getId(url); | ||
result.url = url; | ||
parseArticleOptions.elements.forEach(function (elem) { | ||
result[elem.name] = elem.sel($); | ||
}); | ||
cb(result, err); | ||
}); | ||
} | ||
out(site); | ||
function out(url) { | ||
parse(url, function ($, err) { | ||
var result = {}; | ||
result.id = getId(url); | ||
result.url = url; | ||
parseArticleOptions.elements.forEach(function (elem) { | ||
result[elem.name] = elem.sel($); | ||
}); | ||
cb(result, err); | ||
}); | ||
} | ||
out(site); | ||
} |
{ | ||
"name": "confidencial-ni-node" | ||
, "description": "Web scraping http://www.confidencial.com.ni/" | ||
, "version": "0.0.9" | ||
, "version": "0.0.10" | ||
, "author": "Paulo McNally <paulomcnally@gmail.com>" | ||
@@ -24,2 +24,2 @@ , "keywords": ["nicaragua", "diario", "news"] | ||
} | ||
} | ||
} |
Sorry, the diff of this file is not supported yet
9
574
103001