scrape
A simple, higher-level interface for Go web scraping.
When scraping with Go, I find myself redefining tree traversal and other
utility functions.
This package is a place to put some simple tools that build on top of the
Go HTML parsing library (golang.org/x/net/html).
For the full interface, check out the godoc.
Sample
Scrape defines traversal functions like Find and FindAll while attempting
to be generic. It also defines convenience functions such as Attr and Text.
root, err := html.Parse(resp.Body)
if err != nil {
    // handle error
}
title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
if ok {
    fmt.Println(scrape.Text(title))
}
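FindAll follows the same pattern with any matcher. As a minimal sketch,
assuming a page that marks its comments with class="comment" (ByClass is
one of the package's built-in matchers; the class name here is just an
illustrative assumption about the target page):

// collect every node with class="comment" (hypothetical class name)
comments := scrape.FindAll(root, scrape.ByClass("comment"))
for _, comment := range comments {
    // Text gathers the text of the node and all of its children
    fmt.Println(scrape.Text(comment))
}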
A full example: Scraping Hacker News
package main

import (
    "fmt"
    "net/http"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

func main() {
    // request and parse the front page
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    // define a matcher: anchor tags whose grandparent is the
    // table row (class "athing") that wraps each story
    matcher := func(n *html.Node) bool {
        // check that the parents exist before dereferencing them
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
            return scrape.Attr(n.Parent.Parent, "class") == "athing"
        }
        return false
    }

    // grab all articles and print them
    articles := scrape.FindAll(root, matcher)
    for i, article := range articles {
        fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
    }
}
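Running the program prints each front-page story's title and link, one per
line. Because matchers are plain func(*html.Node) bool values, anything from
a simple tag check to a DOM-structure test like the one above can be
expressed in ordinary Go.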