PDF Reader
In a nutshell, PDF Reader is a simple Go library for reading PDF files that enables text extraction as either Plain Text or Formatted Text.
The library was originally developed at https://github.com/rsc/pdf, then forked and improved at https://github.com/ledongthuc/pdf. Cloudresty has forked ledongthuc's library with the aim of maintaining and improving it further.
Features
- Get plain text content (without formatting)
- Get content (including all font and formatting information)
Install Read PDF Go library
go get -u github.com/cloudresty/pdf
Read PDF Go Library Examples
Go Read PDF - Plain Text
package main
import (
"bytes"
"fmt"
"github.com/cloudresty/pdf"
)
// main sets pdf.DebugOn to true, extracts the plain text of "file.pdf" via
// readPDF and prints it to standard output. It panics if readPDF reports an
// error.
func main() {
	pdf.DebugOn = true

	text, err := readPDF("file.pdf")
	if err != nil {
		panic(err)
	}

	fmt.Println(text)
}
// readPDF opens the PDF file at path and returns its content as plain text.
//
// It returns an empty string and a non-nil error if the file cannot be
// opened, if the plain-text reader cannot be obtained, or if draining that
// reader into memory fails.
func readPDF(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register Close only after Open has succeeded, so it is never invoked
	// on a handle that Open failed to produce.
	defer f.Close()

	plain, err := r.GetPlainText()
	if err != nil {
		return "", err
	}

	// Drain the reader into a buffer; a read failure is reported to the
	// caller instead of silently yielding truncated text.
	var buf bytes.Buffer
	if _, err := buf.ReadFrom(plain); err != nil {
		return "", err
	}
	return buf.String(), nil
}
Go Read PDF - Formatted Text
// readPDF opens the PDF file at path and prints its text to standard output,
// one line per run of consecutive text fragments that isSameSentence reports
// as belonging together. Each line carries the font, font size and x/y
// position of the first fragment of the run, followed by the concatenated
// content of the whole run.
//
// isSameSentence is not defined in this snippet; the surrounding program must
// provide it as a func(pdf.Text, pdf.Text) bool.
//
// The returned string is always empty. The error is non-nil only when the
// file cannot be opened.
func readPDF(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register Close only after Open has succeeded, so it is never invoked
	// on a handle that Open failed to produce.
	defer f.Close()

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		var (
			run     pdf.Text // run of fragments accumulated so far
			pending bool     // true once run holds at least one fragment
		)
		// flush prints the accumulated run, if there is one.
		flush := func() {
			if !pending {
				return
			}
			fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", run.Font, run.FontSize, run.X, run.Y, run.S)
		}

		for _, text := range p.Content().Text {
			switch {
			case !pending:
				// First fragment of the page starts the first run, so no
				// empty zero-valued record is ever printed.
				run, pending = text, true
			case isSameSentence(text, run):
				run.S += text.S
			default:
				flush()
				run = text
			}
		}
		// Emit the final run of the page; without this the last sentence
		// of every page would be dropped.
		flush()
	}
	return "", nil
}
Go Read PDF - Text Grouped by Rows
package main
import (
"fmt"
"os"
"github.com/cloudresty/pdf"
)
// main passes the first command-line argument to readPDF as the path of the
// PDF to process and prints the string readPDF returns. It panics if readPDF
// reports an error.
func main() {
	path := os.Args[1]

	out, err := readPDF(path)
	if err != nil {
		panic(err)
	}

	fmt.Println(out)
}
// readPDF opens the PDF file at path and prints its text grouped by rows to
// standard output: for every row of every non-null page it prints a
// "Row: <position>" line followed by one line per word in that row.
//
// The returned string is always empty. The error is non-nil when the file
// cannot be opened or when the rows of a page cannot be extracted.
func readPDF(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register Close only after Open has succeeded. The Close error is
	// deliberately ignored, as in the original example: nothing is written
	// through f in this function.
	defer func() {
		_ = f.Close()
	}()

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			// Report the failure instead of silently printing nothing for
			// this page.
			return "", fmt.Errorf("reading rows of page %d: %w", pageIndex, err)
		}

		for _, row := range rows {
			// fmt.Println rather than the builtin println, so the row
			// header goes to the same stream as the words below it.
			fmt.Println("Row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
	return "", nil
}