instagram-screen-scrape
Advanced tools
Comparing version 1.0.1 to 2.0.0
#!/usr/bin/env node | ||
try { | ||
require('coffee-script/register'); | ||
// in production, this will fail if coffeescript isn't installed, but the | ||
// coffee is compiled anyway, so it doesn't matter | ||
} catch(e){} | ||
require('../lib/cli'); |
@@ -1,29 +0,50 @@ | ||
// Generated by CoffeeScript 1.9.3 | ||
(function() { | ||
var ArgumentParser, InstagramPosts, JSONStream, argparser, argv, packageInfo; | ||
// Generated by CoffeeScript 1.10.0 | ||
var ArgumentParser, InstagramComments, InstagramPosts, JSONStream, argparser, argv, packageInfo, subcommand, subparser; | ||
InstagramPosts = require('./'); | ||
InstagramPosts = require('./posts'); | ||
packageInfo = require('../package'); | ||
InstagramComments = require('./comments'); | ||
ArgumentParser = require('argparse').ArgumentParser; | ||
packageInfo = require('../package'); | ||
JSONStream = require('JSONStream'); | ||
ArgumentParser = require('argparse').ArgumentParser; | ||
argparser = new ArgumentParser({ | ||
version: packageInfo.version, | ||
addHelp: true, | ||
description: packageInfo.description | ||
}); | ||
JSONStream = require('JSONStream'); | ||
argparser.addArgument(['--username', '-u'], { | ||
type: 'string', | ||
help: 'Username of the account to scrape', | ||
required: true | ||
}); | ||
argparser = new ArgumentParser({ | ||
version: packageInfo.version, | ||
addHelp: true, | ||
description: packageInfo.description | ||
}); | ||
argv = argparser.parseArgs(); | ||
subparser = argparser.addSubparsers({ | ||
dest: 'subcommand' | ||
}); | ||
(new InstagramPosts(argv)).pipe(JSONStream.stringify('[', ',\n', ']\n')).pipe(process.stdout); | ||
subcommand = subparser.addParser('comments', { | ||
description: 'Scrape comments for a given post', | ||
addHelp: true | ||
}); | ||
}).call(this); | ||
subcommand.addArgument(['-p', '--post'], { | ||
type: 'string', | ||
help: 'Alphanumeric post id to scrape. This is unique across all of Instagram (so the username does not need to be specified when this option is used), and the id can be gotten from Instagram URLs with the format `instagram.com/p/<post id>`.' | ||
}); | ||
subcommand = subparser.addParser('posts', { | ||
description: 'Scrape posts by username or post id', | ||
addHelp: true | ||
}); | ||
subcommand.addArgument(['-u', '--username'], { | ||
type: 'string', | ||
help: 'Username of the account to scrape.' | ||
}); | ||
argv = argparser.parseArgs(); | ||
subcommand = argv.subcommand; | ||
delete argv.subcommand; | ||
(subcommand === 'posts' ? new InstagramPosts(argv) : new InstagramComments(argv)).pipe(JSONStream.stringify('[', ',\n', ']\n')).pipe(process.stdout); |
144
lib/index.js
@@ -1,139 +0,5 @@ | ||
// Generated by CoffeeScript 1.9.3 | ||
(function() { | ||
var InstagramPosts, Readable, getPosts, jsonRequest, | ||
bind = function(fn, me){ return function(){ return fn.apply(me, arguments); }; }, | ||
extend = function(child, parent) { for (var key in parent) { if (hasProp.call(parent, key)) child[key] = parent[key]; } function ctor() { this.constructor = child; } ctor.prototype = parent.prototype; child.prototype = new ctor(); child.__super__ = parent.prototype; return child; }, | ||
hasProp = {}.hasOwnProperty; | ||
Readable = require('readable-stream').Readable; | ||
jsonRequest = require('./util').jsonRequest; | ||
/** | ||
* Make a request for an Instagram page, parse the response, and get all the | ||
posts. | ||
* @param {String} username | ||
* @param {String} [startingId] The maximum post id to query for (the lowest one | ||
from the last request), or undefined if this is the first request. | ||
* @return {Stream} A stream of posts | ||
*/ | ||
getPosts = function(username, startingId) { | ||
return jsonRequest('items.*', { | ||
uri: "https://instagram.com/" + username + "/media/", | ||
qs: { | ||
'max_id': startingId | ||
} | ||
}); | ||
}; | ||
/** | ||
* Stream that scrapes as many posts as possible for a given user. | ||
* @param {String} options.username | ||
* @return {Stream} A stream of post objects. | ||
*/ | ||
InstagramPosts = (function(superClass) { | ||
extend(InstagramPosts, superClass); | ||
InstagramPosts.prototype._lock = false; | ||
InstagramPosts.prototype._minPostId = void 0; | ||
function InstagramPosts(arg) { | ||
this.username = arg.username; | ||
this.destroy = bind(this.destroy, this); | ||
this._read = bind(this._read, this); | ||
InstagramPosts.__super__.constructor.call(this, { | ||
highWaterMark: 16, | ||
objectMode: true | ||
}); | ||
this._readableState.destroyed = false; | ||
} | ||
InstagramPosts.prototype._read = function() { | ||
var hasMorePosts, lastPost; | ||
if (this._lock) { | ||
return; | ||
} | ||
this._lock = true; | ||
if (this._readableState.destroyed) { | ||
this.push(null); | ||
return; | ||
} | ||
hasMorePosts = false; | ||
lastPost = void 0; | ||
return getPosts(this.username, this._minPostId).on('error', (function(_this) { | ||
return function(err) { | ||
return _this.emit('error', err); | ||
}; | ||
})(this)).on('data', (function(_this) { | ||
return function(rawPost) { | ||
var post; | ||
hasMorePosts = true; | ||
post = { | ||
id: rawPost.code, | ||
username: _this.username, | ||
time: +rawPost['created_time'], | ||
type: rawPost.type, | ||
like: rawPost.likes.count, | ||
comment: rawPost.comments.count | ||
}; | ||
if (rawPost.caption != null) { | ||
post.text = rawPost.caption.text; | ||
} | ||
if (rawPost.images != null) { | ||
post.image = rawPost.images['standard_resolution'].url; | ||
} | ||
if (rawPost.videos != null) { | ||
post.video = rawPost.videos['standard_resolution'].url; | ||
} | ||
_this._minPostId = rawPost.id; | ||
if (lastPost != null) { | ||
_this.push(lastPost); | ||
} | ||
return lastPost = post; | ||
}; | ||
})(this)).on('end', (function(_this) { | ||
return function() { | ||
if (hasMorePosts) { | ||
_this._lock = false; | ||
} | ||
if (lastPost != null) { | ||
_this.push(lastPost); | ||
} | ||
if (!hasMorePosts) { | ||
return _this.push(null); | ||
} | ||
}; | ||
})(this)); | ||
}; | ||
InstagramPosts.prototype.destroy = function() { | ||
if (this._readableState.destroyed) { | ||
return; | ||
} | ||
this._readableState.destroyed = true; | ||
return this._destroy((function(_this) { | ||
return function(err) { | ||
if (err) { | ||
_this.emit('error', err); | ||
} | ||
return _this.emit('close'); | ||
}; | ||
})(this)); | ||
}; | ||
InstagramPosts.prototype._destroy = function(cb) { | ||
return process.nextTick(cb); | ||
}; | ||
return InstagramPosts; | ||
})(Readable); | ||
module.exports = InstagramPosts; | ||
}).call(this); | ||
// Generated by CoffeeScript 1.10.0 | ||
module.exports = { | ||
InstagramPosts: require('./posts'), | ||
InstagramComments: require('./comments') | ||
}; |
@@ -12,4 +12,5 @@ { | ||
"time": { | ||
"type": "integer", | ||
"description": "UNIX time at which the post was made" | ||
"description": "UNIX time at which the post was made", | ||
"minimum": 0, | ||
"type": "integer" | ||
}, | ||
@@ -23,7 +24,7 @@ "type": { | ||
}, | ||
"like": { | ||
"likes": { | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"comment": { | ||
"comments": { | ||
"type": "integer", | ||
@@ -35,9 +36,5 @@ "minimum": 0 | ||
}, | ||
"image": { | ||
"media": { | ||
"type": "string", | ||
"format": "uri" | ||
}, | ||
"video": { | ||
"type": "string", | ||
"format": "uri" | ||
} | ||
@@ -53,4 +50,5 @@ }, | ||
"comment", | ||
"text" | ||
"text", | ||
"media" | ||
] | ||
} |
@@ -1,36 +0,33 @@ | ||
// Generated by CoffeeScript 1.9.3 | ||
(function() { | ||
var JSONStream, jsonRequest, request, zlib; | ||
// Generated by CoffeeScript 1.10.0 | ||
var JSONStream, jsonRequest, request, zlib; | ||
request = require('request'); | ||
request = require('request'); | ||
JSONStream = require('JSONStream'); | ||
JSONStream = require('JSONStream'); | ||
zlib = require('zlib'); | ||
zlib = require('zlib'); | ||
jsonRequest = function(jsonSelector, options) { | ||
var outStream; | ||
outStream = JSONStream.parse(jsonSelector); | ||
options.gzip = true; | ||
request(options).on('response', function(response) { | ||
var encoding, gunzip, ref; | ||
if (response.statusCode === 200) { | ||
encoding = (ref = response.headers['content-encoding']) != null ? ref.trim().toLowerCase() : void 0; | ||
if (encoding === 'gzip') { | ||
gunzip = zlib.createGunzip(); | ||
return response.pipe(gunzip).pipe(outStream); | ||
} else { | ||
return response.pipe(outStream); | ||
} | ||
jsonRequest = function(jsonSelector, options) { | ||
var outStream; | ||
outStream = JSONStream.parse(jsonSelector); | ||
options.gzip = true; | ||
request(options).on('response', function(response) { | ||
var encoding, gunzip, ref; | ||
if (response.statusCode === 200) { | ||
encoding = (ref = response.headers['content-encoding']) != null ? ref.trim().toLowerCase() : void 0; | ||
if (encoding === 'gzip') { | ||
gunzip = zlib.createGunzip(); | ||
return response.pipe(gunzip).pipe(outStream); | ||
} else { | ||
throw new Error("Instagram returned status code: " + response.statusCode); | ||
return response.pipe(outStream); | ||
} | ||
}); | ||
return outStream; | ||
}; | ||
} else { | ||
return outStream.emit('error', "Instagram returned status code: " + response.statusCode); | ||
} | ||
}); | ||
return outStream; | ||
}; | ||
module.exports = { | ||
jsonRequest: jsonRequest | ||
}; | ||
}).call(this); | ||
module.exports = { | ||
jsonRequest: jsonRequest | ||
}; |
{ | ||
"name": "instagram-screen-scrape", | ||
"description": "scrape public instagram data w/out API access", | ||
"version": "1.0.1", | ||
"version": "2.0.0", | ||
"author": "Sean Lang <slang800@gmail.com>", | ||
@@ -14,12 +14,13 @@ "bin": { | ||
"JSONStream": "^0.10.0", | ||
"argparse": "^1.0.2", | ||
"readable-stream": "^1.0.33", | ||
"request": "^2.55.0" | ||
"argparse": "^1.0.7", | ||
"readable-stream": "^2.0.4", | ||
"request": "^2.65.0", | ||
"tough-cookie": "^2.2.1" | ||
}, | ||
"devDependencies": { | ||
"coffee-script": "^1.9.1", | ||
"coffee-script": "^1.10.0", | ||
"isstream": "^0.1.2", | ||
"json-schema": "^0.2.2", | ||
"mocha": "^2.2.4", | ||
"should": "^5.2.0" | ||
"mocha": "^2.3.4", | ||
"should": "^7.1.1" | ||
}, | ||
@@ -26,0 +27,0 @@ "homepage": "https://github.com/slang800/instagram-screen-scrape", |
# Instagram Screen Scrape | ||
[![Build Status](http://img.shields.io/travis/slang800/instagram-screen-scrape.svg?style=flat-square)](https://travis-ci.org/slang800/instagram-screen-scrape) [![NPM version](http://img.shields.io/npm/v/instagram-screen-scrape.svg?style=flat-square)](https://www.npmjs.org/package/instagram-screen-scrape) [![NPM license](http://img.shields.io/npm/l/instagram-screen-scrape.svg?style=flat-square)](https://www.npmjs.org/package/instagram-screen-scrape) | ||
A tool for scraping public data from Instagram, without needing to get permission from Instagram. It can (theoretically) scrape anything that a non-logged-in user can see. But, right now it only supports getting posts for a given username. | ||
A tool for scraping public data from Instagram, without needing to get permission from Instagram. It can (theoretically) scrape anything that a non-logged-in user can see. But, right now it only supports getting posts for a given username or comments for a given post. | ||
@@ -11,12 +11,22 @@ ## Example | ||
```bash | ||
$ instagram-screen-scrape --username carrotcreative | ||
[{"id":"0toxcII4Eo","username":"carrotcreative","time":1427420497,"type":"image","like":82,"comment":3,"text":"Our CTO, @kylemac, speaking on the #LetsTalkCulture panel tonight @paperlesspost.","image":"https://scontent.cdninstagram.com/hphotos-xaf1/t51.2885-15/e15/11055816_398297847022038_803876945_n.jpg"}, | ||
{"id":"0qPcnuI4Pr","username":"carrotcreative","time":1427306556,"type":"image","like":80,"comment":4,"text":"#bitchesbebakin took it to another level today for @nporteschaikin and @slang800's #Carrotversaries today.","image":"https://scontent.cdninstagram.com/hphotos-xaf1/t51.2885-15/e15/10959049_1546104325652055_1320782099_n.jpg"}, | ||
{"id":"0WLnjlo4Ft","username":"carrotcreative","time":1426633460,"type":"image","like":61,"comment":1,"text":"T-shirts speak louder than words. Come find us @sxsw.","image":"https://scontent.cdninstagram.com/hphotos-xfa1/t51.2885-15/e15/11032904_789885121108568_378908081_n.jpg"}, | ||
$ instagram-screen-scrape posts --username carrotcreative | ||
[{"id":"0toxcII4Eo","username":"carrotcreative","time":1427420497,"type":"image","likes":82,"comments":3,"text":"Our CTO, @kylemac, speaking on the #LetsTalkCulture panel tonight @paperlesspost.","media":"https://scontent.cdninstagram.com/hphotos-xaf1/t51.2885-15/e15/11055816_398297847022038_803876945_n.jpg"}, | ||
{"id":"0qPcnuI4Pr","username":"carrotcreative","time":1427306556,"type":"image","likes":80,"comments":4,"text":"#bitchesbebakin took it to another level today for @nporteschaikin and @slang800's #Carrotversaries today.","media":"https://scontent.cdninstagram.com/hphotos-xaf1/t51.2885-15/e15/10959049_1546104325652055_1320782099_n.jpg"}, | ||
{"id":"0WLnjlo4Ft","username":"carrotcreative","time":1426633460,"type":"image","likes":61,"comments":1,"text":"T-shirts speak louder than words. Come find us @sxsw.","media":"https://scontent.cdninstagram.com/hphotos-xfa1/t51.2885-15/e15/11032904_789885121108568_378908081_n.jpg"}, | ||
``` | ||
We can also scrape comments: | ||
```bash | ||
$ instagram-screen-scrape comments --post 0qPcnuI4Pr | ||
[{"id":"948651188581269518","username":"johnlustina","time":1427308055,"text":"@margeauxlustina"}, | ||
{"id":"948682633420963943","username":"rita_xo","time":1427311804,"text":"👌@emilykalen"}, | ||
{"id":"948734454231433861","username":"david_berkhin","time":1427317981,"text":"looks so good!"}, | ||
{"id":"948824521079751272","username":"k.kate","time":1427328718,"text":"Macarons or a Petri dish full of cells? ¯\\_(ツ)_/¯"}] | ||
``` | ||
By default, there is 1 line per post, making it easy to pipe into other tools. The following example uses `wc -l` to count how many posts are returned. As you can see, I don't post much. | ||
```bash | ||
$ instagram-screen-scrape -u slang800 | wc -l | ||
$ instagram-screen-scrape posts -u slang800 | wc -l | ||
2 | ||
@@ -29,3 +39,3 @@ ``` | ||
```coffee | ||
InstagramPosts = require 'instagram-screen-scrape' | ||
{InstagramPosts} = require 'instagram-screen-scrape' | ||
@@ -36,6 +46,5 @@ # create the stream | ||
# do something interesting with the stream | ||
streamOfPosts.on('readable', -> | ||
streamOfPosts.on('data', (post) -> | ||
# since it's an object-mode stream, we get objects from it and don't need to | ||
# parse JSON or anything. | ||
post = streamOfPosts.read() | ||
# parse JSON or anything | ||
@@ -48,3 +57,3 @@ # the time field is represented in UNIX time | ||
console.log "slang800's post from #{time.toLocaleDateString()} got | ||
#{post.like} like(s), and #{post.comment} comment(s)" | ||
#{post.likes} like(s), and #{post.comments} comment(s)" | ||
) | ||
@@ -57,3 +66,3 @@ ``` | ||
var InstagramPosts, streamOfPosts; | ||
InstagramPosts = require('instagram-screen-scrape'); | ||
InstagramPosts = require('instagram-screen-scrape').InstagramPosts; | ||
@@ -64,6 +73,4 @@ streamOfPosts = new InstagramPosts({ | ||
streamOfPosts.on('readable', function() { | ||
var post, time; | ||
post = streamOfPosts.read(); | ||
time = new Date(post.time * 1000); | ||
streamOfPosts.on('data', function(post) { | ||
var time = new Date(post.time * 1000); | ||
console.log([ | ||
@@ -73,5 +80,5 @@ "slang800's post from ", | ||
" got ", | ||
post.like, | ||
post.likes, | ||
" like(s), and ", | ||
post.comment, | ||
post.comments, | ||
" comment(s)" | ||
@@ -82,2 +89,19 @@ ].join('')); | ||
And we can scrape comments in a similar manner (shown in CoffeeScript): | ||
```coffee | ||
{InstagramComments} = require 'instagram-screen-scrape' | ||
streamOfComments = new InstagramComments(post: '0qPcnuI4Pr') | ||
# do something interesting with the stream | ||
streamOfComments.on('data', (comment) -> | ||
# the time field is represented in UNIX time | ||
time = new Date(comment.time * 1000) | ||
console.log "#{comment.username} commented on #{time.toLocaleDateString()}: | ||
#{comment.text}" | ||
) | ||
``` | ||
## Why? | ||
@@ -84,0 +108,0 @@ The fact that Instagram requires an app to be registered just to access the data that is publicly available on their site is excessively controlling. Scripts should be able to consume the same data as people, and with the same level of authentication. Sadly, Instagram doesn't provide an open, structured, and machine readable API. |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
54582
12
423
108
5
1
+ Added tough-cookie@^2.2.1
+ Added isarray@1.0.0 (transitive)
+ Added process-nextick-args@2.0.1 (transitive)
+ Added readable-stream@2.3.8 (transitive)
+ Added safe-buffer@5.1.2 (transitive)
+ Added string_decoder@1.1.1 (transitive)
+ Added util-deprecate@1.0.2 (transitive)
- Removed isarray@0.0.1 (transitive)
- Removed readable-stream@1.1.14 (transitive)
- Removed safe-buffer@5.2.1 (transitive)
- Removed string_decoder@0.10.31 (transitive)
Updated argparse@^1.0.7
Updated readable-stream@^2.0.4
Updated request@^2.65.0