Comparing version
@@ -1,19 +0,12 @@ | ||
interface IOptions { | ||
urls: string[]; | ||
timeout: number; | ||
} | ||
interface IReaderResult { | ||
title: string; | ||
content: string; | ||
length: number; | ||
textContent: string; | ||
excerpt: string; | ||
} | ||
import { IOptions, IReaderResult } from './interface'; | ||
export declare class URLReader { | ||
private browser; | ||
private timeout; | ||
private turndown; | ||
constructor(); | ||
init(): Promise<void>; | ||
read(options: IOptions): Promise<IReaderResult[]>; | ||
private readDoc; | ||
private html2md; | ||
} | ||
export {}; | ||
export default URLReader; |
@@ -0,10 +1,16 @@ | ||
import { JSDOM } from 'jsdom'; | ||
import Turndown from 'turndown'; | ||
import puppeteer from 'puppeteer'; | ||
import { Readability } from '@mozilla/readability'; | ||
import { JSDOM } from 'jsdom'; | ||
class URLReaderError extends Error { | ||
} | ||
; | ||
export class URLReader { | ||
browser; | ||
timeout; | ||
turndown; | ||
constructor() { | ||
this.timeout = 60000; | ||
this.browser = null; | ||
this.turndown = new Turndown(); | ||
} | ||
@@ -14,9 +20,11 @@ async init() { | ||
return; | ||
this.browser = await puppeteer.launch(); | ||
this.browser = await puppeteer.launch({ | ||
headless: true, | ||
}); | ||
} | ||
async read(options) { | ||
const { urls, timeout } = options; | ||
const { urls, timeout, enableMarkdown = true, runScripts } = options; | ||
const { browser, timeout: defaultTimeout } = this; | ||
if (!browser) | ||
throw new Error('browser is null'); | ||
throw new URLReaderError('browser is null.'); | ||
const results = []; | ||
@@ -26,3 +34,3 @@ for (const url of urls) { | ||
const res = await page.goto(url, { | ||
timeout: timeout || defaultTimeout | ||
timeout: timeout ?? defaultTimeout, | ||
}); | ||
@@ -33,13 +41,17 @@ const txt = await res?.text(); | ||
const doc = new JSDOM(txt, { | ||
url | ||
url, | ||
runScripts, | ||
}); | ||
const reader = new Readability(doc.window.document); | ||
const article = reader.parse(); | ||
const article = await this.readDoc(doc.window.document); | ||
let markdown = ''; | ||
if (enableMarkdown) | ||
markdown = await this.html2md(article?.content ?? ''); | ||
if (article) { | ||
results.push({ | ||
length: article.length, | ||
title: article.title, | ||
content: article.content, | ||
length: article.length, | ||
textContent: article.textContent, | ||
excerpt: article.excerpt | ||
html: article.content, | ||
text: article.textContent, | ||
markdown, | ||
excerpt: article.excerpt, | ||
}); | ||
@@ -51,2 +63,12 @@ } | ||
} | ||
async readDoc(doc) { | ||
const reader = new Readability(doc); | ||
return reader.parse(); | ||
} | ||
async html2md(html) { | ||
if (!html) | ||
return ''; | ||
return this.turndown?.turndown(html); | ||
} | ||
} | ||
export default URLReader; |
@@ -8,2 +8,3 @@ { | ||
"url-reader", | ||
"urlReader", | ||
"json", | ||
@@ -14,3 +15,3 @@ "html", | ||
], | ||
"version": "1.0.0", | ||
"version": "1.0.1", | ||
"main": "./dist/index.js", | ||
@@ -21,29 +22,31 @@ "author": "zac ma", | ||
"scripts": { | ||
"start": "node ./dist/app.js", | ||
"dev": "nodemon --exec tsx ./src/app.ts", | ||
"build": "tsc --project tsconfig.build.json --outDir dist/", | ||
"lint": "yarn eslint .", | ||
"test": "jest" | ||
}, | ||
"engines": { | ||
"node": ">=20.0.0" | ||
"node": ">=20.11.0" | ||
}, | ||
"dependencies": { | ||
"@mozilla/readability": "^0.5.0", | ||
"fastify": "^4.26.2", | ||
"jsdom": "^24.0.0", | ||
"puppeteer": "^22.6.5" | ||
"puppeteer": "^22.6.5", | ||
"turndown": "^7.1.3" | ||
}, | ||
"devDependencies": { | ||
"@eslint/eslintrc": "^3.0.2", | ||
"@eslint/js": "^9.0.0", | ||
"@tsconfig/recommended": "^1.0.6", | ||
"@eslint/js": "^9.1.1", | ||
"@stylistic/eslint-plugin": "^1.7.2", | ||
"@types/jsdom": "^21.1.6", | ||
"@types/node": "^20.12.7", | ||
"@typescript-eslint/eslint-plugin": "^6.4.0", | ||
"eslint": "^9.0.0", | ||
"eslint-config-standard-with-typescript": "^43.0.1", | ||
"eslint-plugin-import": "^2.25.2", | ||
"eslint-plugin-n": "^15.0.0 || ^16.0.0 ", | ||
"eslint-plugin-promise": "^6.0.0", | ||
"@types/turndown": "^5.0.4", | ||
"eslint": "^9.1.1", | ||
"globals": "^15.0.0", | ||
"tsx": "^4.7.3", | ||
"typescript": "^5.4.5", | ||
"typescript-eslint": "^7.7.0" | ||
"typescript-eslint": "^7.7.1", | ||
"nodemon": "^3.1.0" | ||
} | ||
} |
# URL READER | ||
This project allows you to read the content of a URL. | ||
This project helps you read the content of URLs and returns each page's title, length, HTML, text, Markdown, and excerpt. | ||
> "node": ">=20.11.0" | ||
## Installation | ||
@@ -9,13 +11,76 @@ | ||
yarn add urlreader | ||
# or npm install urlreader | ||
``` | ||
### Troubleshooting | ||
## Usage | ||
- puppeteer | ||
```ts | ||
import URLReader from 'urlreader'; | ||
the puppeteer will download the chromium browser | ||
const reader = new URLReader(); | ||
await reader.init(); | ||
const results = await reader.read({ | ||
urls: ['https://www.google.com'], | ||
timeout: 10000, // ms, default: 60000 | ||
enableMarkdown: false, // default: true | ||
runScripts: 'dangerously', // run the scripts included in the HTML and fetch remote resources; disabled by default. | ||
}); | ||
``` | ||
Parsed Result: | ||
```ts | ||
interface IReaderResult { | ||
title: string; | ||
length: number; | ||
html: string; | ||
text: string; | ||
markdown?: string; | ||
excerpt: string; | ||
} | ||
``` | ||
## Server | ||
* start server | ||
```bash | ||
git clone https://github.com/yokingma/url-reader.git | ||
cd url-reader | ||
# default listen on port 3030 | ||
yarn run start | ||
``` | ||
* api | ||
```txt | ||
GET /reader?url=https://www.google.com | ||
POST /reader | ||
{ | ||
urls: ['https://www.google.com', 'https://www.bing.com'] | ||
} | ||
``` | ||
## Docker | ||
```bash | ||
docker build -t urlreader . # urlreader is your image's tag name | ||
``` | ||
## Tips | ||
- puppeteer | ||
When you install Puppeteer, it automatically downloads a recent version of Chrome for Testing (~170MB macOS, ~282MB Linux, ~280MB Windows) and a chrome-headless-shell binary. | ||
## Troubleshooting | ||
- install error with puppeteer | ||
```txt | ||
Error [ERR_TLS_CERT_ALTNAME_INVALID]: Hostname/IP does not match certificate's altnames... | ||
``` | ||
Remove the .npmrc file and reinstall. | ||
@@ -0,21 +1,13 @@ | ||
import { JSDOM } from 'jsdom'; | ||
import Turndown from 'turndown'; | ||
import puppeteer, { type Browser } from 'puppeteer'; | ||
import { Readability } from '@mozilla/readability'; | ||
import { JSDOM } from 'jsdom'; | ||
import { IOptions, IReaderResult } from './interface'; | ||
interface IOptions { | ||
urls: string[] | ||
timeout: number | ||
} | ||
class URLReaderError extends Error {}; | ||
interface IReaderResult { | ||
title: string | ||
content: string | ||
length: number | ||
textContent: string | ||
excerpt: string | ||
} | ||
export class URLReader { | ||
private browser: null | Browser | ||
private timeout: number | ||
private browser: null | Browser; | ||
private timeout: number; | ||
private turndown: Turndown; | ||
@@ -25,2 +17,3 @@ constructor() { | ||
this.browser = null; | ||
this.turndown = new Turndown(); | ||
} | ||
@@ -30,9 +23,11 @@ | ||
if (this.browser) return; | ||
this.browser = await puppeteer.launch(); | ||
this.browser = await puppeteer.launch({ | ||
headless: true, | ||
}); | ||
} | ||
public async read(options: IOptions) { | ||
const { urls, timeout } = options; | ||
const { urls, timeout, enableMarkdown = true, runScripts } = options; | ||
const { browser, timeout: defaultTimeout } = this; | ||
if (!browser) throw new Error('browser is null'); | ||
if (!browser) throw new URLReaderError('browser is null.'); | ||
const results: IReaderResult[] = []; | ||
@@ -42,24 +37,26 @@ for (const url of urls) { | ||
const res = await page.goto(url, { | ||
timeout: timeout || defaultTimeout | ||
}) | ||
timeout: timeout ?? defaultTimeout, | ||
}); | ||
const txt = await res?.text(); | ||
if (!txt) continue; | ||
const doc = new JSDOM(txt, { | ||
url | ||
url, | ||
runScripts, | ||
}); | ||
const reader = new Readability(doc.window.document); | ||
const article = reader.parse(); | ||
const article = await this.readDoc(doc.window.document); | ||
let markdown = ''; | ||
if (enableMarkdown) markdown = await this.html2md(article?.content ?? ''); | ||
if (article) { | ||
results.push({ | ||
length: article.length, | ||
title: article.title, | ||
content: article.content, | ||
length: article.length, | ||
textContent: article.textContent, | ||
excerpt: article.excerpt | ||
}) | ||
html: article.content, | ||
text: article.textContent, | ||
markdown, | ||
excerpt: article.excerpt, | ||
}); | ||
} | ||
await page.close() | ||
await page.close(); | ||
} | ||
@@ -69,2 +66,14 @@ | ||
} | ||
private async readDoc(doc: Document) { | ||
const reader = new Readability(doc); | ||
return reader.parse(); | ||
} | ||
private async html2md(html: string) { | ||
if (!html) return ''; | ||
return this.turndown?.turndown(html); | ||
} | ||
} | ||
export default URLReader; |
@@ -1,1 +0,1 @@ | ||
export * from './core'; | ||
export * from './core'; |
{ | ||
"extends": "@tsconfig/recommended/tsconfig.json", | ||
"compilerOptions": { | ||
"rootDir": "./src", | ||
"declaration": true, | ||
"module": "ES2022", | ||
"module": "ESNext", | ||
"moduleResolution": "node", | ||
"target": "ES2022", | ||
"target": "ESNext", | ||
"outDir": "dist", | ||
"noImplicitAny": true, | ||
"allowSyntheticDefaultImports": true, | ||
"strict": true, | ||
// "noEmit": true, | ||
"noEmit": false, | ||
"typeRoots": [ | ||
@@ -18,3 +19,4 @@ "node_modules/@types" | ||
"src/**/*.ts", | ||
"test/**/*.ts", "tsconfig.build.json", | ||
"test/**/*.ts", | ||
"tsconfig.build.json", | ||
], | ||
@@ -25,4 +27,5 @@ "exclude": [ | ||
"node_modules", | ||
"jest.config.js" | ||
"jest.config.js", | ||
"eslint.config.js" | ||
] | ||
} |
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
13632
121.08%11
-21.43%20
81.82%343
83.42%86
309.52%5
66.67%2
100%1
Infinity%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added