
Security News
Node.js Moves Toward Stable TypeScript Support with Amaro 1.0
Amaro 1.0 lays the groundwork for stable TypeScript support in Node.js, bringing official .ts loading closer to reality.
A toolkit for quickly performing crawler functions
pip install quick-crawler
Example 1:
from quick_crawler.page import *
if __name__=="__main__":
# get a html page and can save the file if the file path is assigned.
url="https://learnersdictionary.com/3000-words/alpha/a"
html_str=quick_html_page(url)
print(html_str)
# get a json object from html string
html_obj=quick_html_object(html_str)
word_list=html_obj.find("ul",{"class":"a_words"}).findAll("li")
print("word list: ")
for word in word_list:
print(word.find("a").text.replace(" ","").strip())
# get or download a series of url with similar format, like a page list
url_range="https://learnersdictionary.com/3000-words/alpha/a/{pi}"
list_html_str=quick_html_page_range(url_range,min_page=1,max_page=10)
for idx,html in enumerate(list_html_str):
html_obj = quick_html_object(html)
word_list = html_obj.find("ul", {"class": "a_words"}).findAll("li")
list_w=[]
for word in word_list:
list_w.append(word.find("a").text.replace(" ", "").strip())
print(f"Page {idx+1}: ", ','.join(list_w))
Example 2:
from quick_crawler.page import *
if __name__=="__main__":
# remove unicode str
u_str = 'aà\xb9'
u_str_removed = quick_remove_unicode(u_str)
print("Removed str: ", u_str_removed)
# get json object online
json_url="http://soundcloud.com/oembed?url=http%3A//soundcloud.com/forss/flickermood&format=json"
json_obj=quick_json_obj(json_url)
print(json_obj)
for k in json_obj:
print(k,json_obj[k])
# read a series of obj from a json list online
json_list_url = "https://jsonplaceholder.typicode.com/posts"
json_list = quick_json_obj(json_list_url)
print(json_list)
for obj in json_list:
userId = obj["userId"]
title = obj["title"]
body = obj["body"]
print(obj)
# quick save csv file from a list of json objects
quick_save_csv("news_list.csv",['userId','id','title','body'],json_list)
# quick read csv file to a list of fields
list_result=quick_read_csv("news_list.csv",fields=['userId','title'])
print(list_result)
# quick download a file
quick_download_file("https://www.englishclub.com/images/english-club-C90.png",save_file_path="logo.png")
Example 3: obtain html text from the Browser
from quick_crawler import browser
import os
if __name__=="__main__":
html_str=browser.get_html_str_with_browser("https://pypi.org/project/quick-crawler/0.0.2/",driver_path='../../examples/browsers/chromedriver.exe')
print(html_str)
Example 4: Crawl a series of web pages from a group of websites
from quick_crawler import browser
import os
list_item=[
['CNN','https://edition.cnn.com/'],
['AP','https://apnews.com/']
]
current_path = os.path.dirname(os.path.realpath(__file__))
browser.fetch_meta_info_from_sites(list_item,current_path+"/data",is_save_fulltext=True,use_plain_text=True)
Example 5: Crawl a series of websites with advanced settings
from quick_crawler import page,browser
import os
import pickle
list_item=pickle.load(open("list_news_site.pickle","rb"))[20:]
current_path = os.path.dirname(os.path.realpath(__file__))
browser.fetch_meta_info_from_sites(list_item,current_path+"/news_data1",
is_save_fulltext=True,
use_plain_text=False,
max_num_urls=100,
use_keywords=True
)
list_model=browser.summarize_downloaded_data("news_data1",
# save_path="news_data_list.csv"
)
Example 6: Multi-lang crawler
import os
from quick_crawler.multilang import get_sites_with_multi_lang_keywords
keywords="digital economy"
init_urls=[
["en-cnn","https://edition.cnn.com/"],
['jp-asahi', 'https://www.asahi.com/'],
['ru-mk', 'https://www.mk.ru/'],
['zh-xinhuanet', 'http://xinhuanet.com/'],
]
current_path = os.path.dirname(os.path.realpath(__file__))
list_item=get_sites_with_multi_lang_keywords(
init_urls=init_urls,
src_term=keywords,
src_language="en",
target_langs=["ja","zh","es","ru"],
save_data_folder=f"{current_path}/news_data3"
)
Example 7: get multiple translations based on a keyword
import pickle
from quick_crawler.language import *
terms = 'digital economy'
dict_lang=get_lang_dict_by_translation("en",terms)
pickle.dump(dict_lang,open(f"multi-lang-{terms}.pickle",'wb'))
Example 8: Pipeline for web page list processing
from quick_crawler.pipline.page_list import run_web_list_analysis_shell
if __name__=="__main__":
def find_list(html_obj):
return html_obj.find("div", {"class": "bd"}).findAll("li")
def get_item(item):
datetime = item.find("span").text
title = item.find("a").text
url = item.find("a")["href"]
return title, url, datetime
run_web_list_analysis_shell(
url_pattern="https://www.abc.com/index_{p}.html",
working_folder='test',
min_page=1,
max_page=2,
fn_find_list=find_list,
fn_get_item=get_item,
tag='xxxx'
)
The quick-crawler
project is provided by Donghua Chen.
FAQs
A toolkit for quickly performing crawler functions
We found that quick-crawler demonstrated a healthy version release cadence and project activity because the last version was released less than a year ago. It has 1 open source maintainer collaborating on the project.
Did you know?
Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.
Security News
Amaro 1.0 lays the groundwork for stable TypeScript support in Node.js, bringing official .ts loading closer to reality.
Research
A deceptive PyPI package posing as an Instagram growth tool collects user credentials and sends them to third-party bot services.
Product
Socket now supports pylock.toml, enabling secure, reproducible Python builds with advanced scanning and full alignment with PEP 751's new standard.