PDF Reader

A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf
Features
- Get plain text content (without formatting)
- Get content (including all font and formatting information)
Install:
go get -u github.com/ledongthuc/pdf
Examples:
- See the examples/ folder
Read plain text
package main
import (
"bytes"
"fmt"
"github.com/ledongthuc/pdf"
)
// main opens ./pdf_test.pdf, extracts its plain text and prints it to
// standard output. Any failure while opening, extracting or buffering the
// text aborts the program with a panic.
func main() {
	// NOTE(review): presumably switches on the library's debug output — confirm
	// against the pdf package before copying this into production code.
	pdf.DebugOn = true

	f, r, err := pdf.Open("./pdf_test.pdf")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	b, err := r.GetPlainText()
	if err != nil {
		panic(err)
	}

	// Drain the returned reader into memory; a read failure must not be
	// dropped, otherwise truncated text would be printed silently.
	var buf bytes.Buffer
	if _, err := buf.ReadFrom(b); err != nil {
		panic(err)
	}

	content := buf.String()
	fmt.Println(content)
}
Read all text with styles from PDF
package main
import (
"fmt"
"github.com/ledongthuc/pdf"
)
// main opens ./pdf_test.pdf and prints one line per styled text fragment,
// showing its font, font size, position and content. Failing to open the
// file or to extract the styled texts panics.
func main() {
	// One output line per fragment: font name, size, x/y position, text.
	const layout = "Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n"

	file, reader, openErr := pdf.Open("./pdf_test.pdf")
	if openErr != nil {
		panic(openErr)
	}
	defer file.Close()

	styled, styleErr := reader.GetStyledTexts()
	if styleErr != nil {
		panic(styleErr)
	}

	for _, text := range styled {
		fmt.Printf(layout, text.Font, text.FontSize, text.X, text.Y, text.S)
	}
}
Read text grouped by rows
package main
import (
"fmt"
"os"
"github.com/ledongthuc/pdf"
)
// main reads the PDF whose path is given as the first command-line argument
// via readPdf and prints the string it returns. A readPdf error panics.
func main() {
	// Guard the argument access: indexing os.Args[1] without it panics with
	// an index-out-of-range error when the program is run with no arguments.
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage:", os.Args[0], "<path-to-pdf>")
		os.Exit(1)
	}

	content, err := readPdf(os.Args[1])
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
}
// readPdf opens the PDF at path and, page by page, prints every text row:
// first a ">>>> row: " marker with the row position, then each word of the
// row on its own line. Pages that are null or have no "Contents" entry are
// skipped.
//
// The returned string is always empty; the text is only printed. A non-nil
// error is returned when the file cannot be opened or when the rows of a
// page cannot be extracted.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only after a successful open, so it never runs on a
	// handle that was not opened. There is nothing useful to do with a Close
	// error here, so it is discarded explicitly.
	defer func() {
		_ = f.Close()
	}()

	totalPage := r.NumPage()
	// Pages are addressed starting from 1.
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() || p.V.Key("Contents").Kind() == pdf.Null {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			return "", fmt.Errorf("reading rows of page %d: %w", pageIndex, err)
		}
		for _, row := range rows {
			println(">>>> row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
	return "", nil
}
Demo
