Purpose of the fork
This fork of rsc.io/pdf extends the package API with:
- Implement the method GetPlainText() from object Page. Use to get plain text content (without format)
Installing this fork
In your project, install via go modules:
go get github.com/CerviTalotekniikka/pdf@latest
Release instructions
After making changes, please tag the commit with semantic version prefixed with 'v'
git tag
git tag v0.2.0
git push
git push --tags
How to read all text from PDF:
-
Get the library with command go get -u github.com/ledongthuc/pdf
-
I write an example function to read file from PATH and return the content of PDF
package main
import (
"bytes"
"fmt"
"github.com/ledongthuc/pdf"
)
func main() {
content, err := readPdf("test.pdf")
if err != nil {
panic(err)
}
fmt.Println(content)
return
}
func readPdf(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
return "", err
}
totalPage := r.NumPage()
var textBuilder bytes.Buffer
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
p := r.Page(pageIndex)
if p.V.IsNull() {
continue
}
textBuilder.WriteString(p.GetPlainText("\n"))
}
return textBuilder.String(), nil
}
How to read all text with styles from PDF
func readPdf2(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
return "", err
}
totalPage := r.NumPage()
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
p := r.Page(pageIndex)
if p.V.IsNull() {
continue
}
var lastTextStyle pdf.Text
texts := p.Content().Text
for _, text := range texts {
if isSameSentence(text, lastTextStyle) {
lastTextStyle.S = lastTextStyle.S + text.S
} else {
fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
lastTextStyle = text
}
}
}
return "", nil
}
Demo