You're Invited:Meet the Socket Team at BlackHat and DEF CON in Las Vegas, Aug 4-6.RSVP
Socket
Book a DemoInstallSign in
Socket

url-reader

Package Overview
Dependencies
Maintainers
1
Versions
3
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

url-reader - npm Package Compare versions

Comparing version

to
1.0.1

.dockerignore

17

dist/core.d.ts

@@ -1,19 +0,12 @@

/** Options accepted by `URLReader.read`. */
interface IOptions {
/** URLs to load; `read` visits them one after another. */
urls: string[];
/** Navigation timeout handed to `page.goto`; the README documents it as milliseconds. */
timeout: number;
}
/**
 * One entry of the array resolved by `URLReader.read`.
 * In this shape every field is copied from the same-named property of the
 * article object returned by Readability's `parse()`.
 */
interface IReaderResult {
title: string;
content: string;
length: number;
textContent: string;
excerpt: string;
}
import { IOptions, IReaderResult } from './interface';
/**
 * Type declaration for the URL reader class.
 * Call `init()` before `read()`: the implementation throws when the browser
 * has not been launched yet.
 */
export declare class URLReader {
private browser;
private timeout;
private turndown;
constructor();
/** Launches the puppeteer browser used by `read`. */
init(): Promise<void>;
/** Loads each URL in `options.urls` and resolves with the parsed results. */
read(options: IOptions): Promise<IReaderResult[]>;
private readDoc;
private html2md;
}
export {};
export default URLReader;

@@ -0,10 +1,16 @@

import { JSDOM } from 'jsdom';
import Turndown from 'turndown';
import puppeteer from 'puppeteer';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
/**
 * Error raised by URLReader itself (for example when `read` is called before
 * the browser has been launched).
 *
 * Sets `name` explicitly so stack traces and logs show "URLReaderError"
 * instead of the inherited "Error". All constructor arguments are forwarded
 * to `Error` unchanged.
 */
class URLReaderError extends Error {
  constructor(...args) {
    super(...args);
    this.name = 'URLReaderError';
  }
}
export class URLReader {
browser;
timeout;
turndown;
// Sets up defaults only; no browser is launched here (init() does that).
constructor() {
// Fallback navigation timeout, used by read() when the caller supplies none.
this.timeout = 60000;
// Stays null until init() assigns the launched puppeteer browser.
this.browser = null;
// Turndown instance reused by html2md().
this.turndown = new Turndown();
}

@@ -14,9 +20,11 @@ async init() {

return;
this.browser = await puppeteer.launch();
this.browser = await puppeteer.launch({
headless: true,
});
}
async read(options) {
const { urls, timeout } = options;
const { urls, timeout, enableMarkdown = true, runScripts } = options;
const { browser, timeout: defaultTimeout } = this;
if (!browser)
throw new Error('browser is null');
throw new URLReaderError('browser is null.');
const results = [];

@@ -26,3 +34,3 @@ for (const url of urls) {

const res = await page.goto(url, {
timeout: timeout || defaultTimeout
timeout: timeout ?? defaultTimeout,
});

@@ -33,13 +41,17 @@ const txt = await res?.text();

const doc = new JSDOM(txt, {
url
url,
runScripts,
});
const reader = new Readability(doc.window.document);
const article = reader.parse();
const article = await this.readDoc(doc.window.document);
let markdown = '';
if (enableMarkdown)
markdown = await this.html2md(article?.content ?? '');
if (article) {
results.push({
length: article.length,
title: article.title,
content: article.content,
length: article.length,
textContent: article.textContent,
excerpt: article.excerpt
html: article.content,
text: article.textContent,
markdown,
excerpt: article.excerpt,
});

@@ -51,2 +63,12 @@ }

}
async readDoc(doc) {
const reader = new Readability(doc);
return reader.parse();
}
async html2md(html) {
if (!html)
return '';
return this.turndown?.turndown(html);
}
}
export default URLReader;

@@ -8,2 +8,3 @@ {

"url-reader",
"urlReader",
"json",

@@ -14,3 +15,3 @@ "html",

],
"version": "1.0.0",
"version": "1.0.1",
"main": "./dist/index.js",

@@ -21,29 +22,31 @@ "author": "zac ma",

"scripts": {
"start": "node ./dist/app.js",
"dev": "nodemon --exec tsx ./src/app.ts",
"build": "tsc --project tsconfig.build.json --outDir dist/",
"lint": "yarn eslint .",
"test": "jest"
},
"engines": {
"node": ">=20.0.0"
"node": ">=20.11.0"
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"fastify": "^4.26.2",
"jsdom": "^24.0.0",
"puppeteer": "^22.6.5"
"puppeteer": "^22.6.5",
"turndown": "^7.1.3"
},
"devDependencies": {
"@eslint/eslintrc": "^3.0.2",
"@eslint/js": "^9.0.0",
"@tsconfig/recommended": "^1.0.6",
"@eslint/js": "^9.1.1",
"@stylistic/eslint-plugin": "^1.7.2",
"@types/jsdom": "^21.1.6",
"@types/node": "^20.12.7",
"@typescript-eslint/eslint-plugin": "^6.4.0",
"eslint": "^9.0.0",
"eslint-config-standard-with-typescript": "^43.0.1",
"eslint-plugin-import": "^2.25.2",
"eslint-plugin-n": "^15.0.0 || ^16.0.0 ",
"eslint-plugin-promise": "^6.0.0",
"@types/turndown": "^5.0.4",
"eslint": "^9.1.1",
"globals": "^15.0.0",
"tsx": "^4.7.3",
"typescript": "^5.4.5",
"typescript-eslint": "^7.7.0"
"typescript-eslint": "^7.7.1",
"nodemon": "^3.1.0"
}
}
# URL READER
This project allows you to read the content of a URL.
This project reads the content of URLs and returns each page's title, length, HTML, text, Markdown, and excerpt.
> "node": ">=20.11.0"
## Installation

@@ -9,13 +11,76 @@

yarn add urlreader
# or npm install urlreader
```
### Troubleshooting
## Usage
- puppeteer
```ts
import URLReader from 'urlreader';
the puppeteer will download the chromium browser
const reader = new URLReader();
await reader.init();
const results = await reader.read({
urls: ['https://www.google.com'],
timeout: 10000, // ms, default: 60000
enableMarkdown: false, // default: true
runScripts: 'dangerously', // run the scripts included in the HTML and fetch remote resources; disabled by default.
});
```
Parsed Result:
```ts
interface IReaderResult {
title: string;
length: number;
html: string;
text: string;
markdown?: string;
excerpt: string;
}
```
## Server
* start server
```bash
git clone https://github.com/yokingma/url-reader.git
cd url-reader
# default listen on port 3030
yarn run start
```
* api
```txt
GET /reader?url=https://www.google.com
POST /reader
{
urls: ['https://www.google.com', 'https://www.bing.com']
}
```
## Docker
```bash
docker build -t urlreader . # urlreader is your image's tag name
```
## Tips
- puppeteer
When you install Puppeteer, it automatically downloads a recent version of Chrome for Testing (~170MB macOS, ~282MB Linux, ~280MB Windows) and a chrome-headless-shell binary.
## Troubleshooting
- install error with puppeteer
```txt
Error [ERR_TLS_CERT_ALTNAME_INVALID]: Hostname/IP does not match certificate's altnames...
```
remove .npmrc file and re-install.

@@ -0,21 +1,13 @@

import { JSDOM } from 'jsdom';
import Turndown from 'turndown';
import puppeteer, { type Browser } from 'puppeteer';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { IOptions, IReaderResult } from './interface';
/** Options accepted by `URLReader.read`. */
interface IOptions {
/** URLs to load; `read` visits them one after another. */
urls: string[]
/** Navigation timeout handed to `page.goto`; the README documents it as milliseconds. */
timeout: number
}
/**
 * Error raised by URLReader itself (for example when `read` is called before
 * the browser has been launched).
 *
 * Sets `name` explicitly so stack traces and logs show "URLReaderError"
 * instead of the inherited "Error". All constructor arguments are forwarded
 * to `Error` unchanged.
 */
class URLReaderError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    this.name = 'URLReaderError';
  }
}
/**
 * One entry of the array resolved by `URLReader.read`.
 * In this shape every field is copied from the same-named property of the
 * article object returned by Readability's `parse()`.
 */
interface IReaderResult {
title: string
content: string
length: number
textContent: string
excerpt: string
}
export class URLReader {
private browser: null | Browser
private timeout: number
private browser: null | Browser;
private timeout: number;
private turndown: Turndown;

@@ -25,2 +17,3 @@ constructor() {

this.browser = null;
this.turndown = new Turndown();
}

@@ -30,9 +23,11 @@

if (this.browser) return;
this.browser = await puppeteer.launch();
this.browser = await puppeteer.launch({
headless: true,
});
}
public async read(options: IOptions) {
const { urls, timeout } = options;
const { urls, timeout, enableMarkdown = true, runScripts } = options;
const { browser, timeout: defaultTimeout } = this;
if (!browser) throw new Error('browser is null');
if (!browser) throw new URLReaderError('browser is null.');
const results: IReaderResult[] = [];

@@ -42,24 +37,26 @@ for (const url of urls) {

const res = await page.goto(url, {
timeout: timeout || defaultTimeout
})
timeout: timeout ?? defaultTimeout,
});
const txt = await res?.text();
if (!txt) continue;
const doc = new JSDOM(txt, {
url
url,
runScripts,
});
const reader = new Readability(doc.window.document);
const article = reader.parse();
const article = await this.readDoc(doc.window.document);
let markdown = '';
if (enableMarkdown) markdown = await this.html2md(article?.content ?? '');
if (article) {
results.push({
length: article.length,
title: article.title,
content: article.content,
length: article.length,
textContent: article.textContent,
excerpt: article.excerpt
})
html: article.content,
text: article.textContent,
markdown,
excerpt: article.excerpt,
});
}
await page.close()
await page.close();
}

@@ -69,2 +66,14 @@

}
/**
 * Feeds the given DOM document to Readability and resolves with whatever
 * `parse()` returns (the caller checks the result before using it).
 */
private async readDoc(doc: Document) {
  return new Readability(doc).parse();
}
/**
 * Converts an HTML string to Markdown through the shared Turndown instance.
 * An empty input short-circuits to an empty string.
 */
private async html2md(html: string) {
  return html ? this.turndown?.turndown(html) : '';
}
}
export default URLReader;

@@ -1,1 +0,1 @@

export * from './core';
export * from './core';
{
"extends": "@tsconfig/recommended/tsconfig.json",
"compilerOptions": {
"rootDir": "./src",
"declaration": true,
"module": "ES2022",
"module": "ESNext",
"moduleResolution": "node",
"target": "ES2022",
"target": "ESNext",
"outDir": "dist",
"noImplicitAny": true,
"allowSyntheticDefaultImports": true,
"strict": true,
// "noEmit": true,
"noEmit": false,
"typeRoots": [

@@ -18,3 +19,4 @@ "node_modules/@types"

"src/**/*.ts",
"test/**/*.ts", "tsconfig.build.json",
"test/**/*.ts",
"tsconfig.build.json",
],

@@ -25,4 +27,5 @@ "exclude": [

"node_modules",
"jest.config.js"
"jest.config.js",
"eslint.config.js"
]
}