Skip to content

Latest commit

 

History

History
78 lines (62 loc) · 1.43 KB

README.md

File metadata and controls

78 lines (62 loc) · 1.43 KB

PDF Reader

A simple Go library which enables reading PDF files.

Features

  • Get plain text content (without formatting)
  • Get content (including all font and formatting information)

Install:

go get -u github.com/rsc/pdf

Read plain text

package main

import (
	"bytes"
	"fmt"

	"github.com/rsc/pdf"
)

// main reads the plain text of the local file "test.pdf" via readPdf and
// prints it to standard output. It panics if readPdf returns an error.
func main() {
	content, err := readPdf("test.pdf") // Read local pdf file
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
}

// readPdf opens the PDF file at path and returns its plain-text content as a
// single string.
//
// It returns the error from pdf.Open when the file cannot be opened, and the
// error from draining the plain-text reader when reading fails. On error the
// returned string is empty.
func readPdf(path string) (string, error) {
	r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}

	// NOTE(review): assumes GetPlainText is a method of the reader returned by
	// pdf.Open and yields a single io.Reader, as the original call shape
	// implies — confirm against the library version in use.
	var buf bytes.Buffer
	if _, err := buf.ReadFrom(r.GetPlainText()); err != nil {
		return "", err
	}
	return buf.String(), nil
}

Read all text with styles from PDF

// readPdf2 opens the PDF file at path and, page by page, prints each run of
// text together with its font, font size and x/y position.
//
// Consecutive text fragments for which isSameSentence reports true are merged
// into one run before printing. Runs never span pages. Pages whose value is
// null are skipped.
//
// The output goes to standard output; the returned string is always empty.
// The only error returned is the one from pdf.Open.
func readPdf2(path string) (string, error) {
	r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	totalPage := r.NumPage()

	// emit prints one merged run in the fixed report format.
	emit := func(t pdf.Text) {
		fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", t.Font, t.FontSize, t.X, t.Y, t.S)
	}

	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		var (
			run     pdf.Text // run currently being accumulated
			pending bool     // true once run holds at least one fragment
		)
		for _, text := range p.Content().Text {
			switch {
			case !pending:
				// First fragment on the page starts the first run; there is
				// no previous run to compare against or to print.
				run, pending = text, true
			case isSameSentence(text, run):
				run.S += text.S
			default:
				emit(run)
				run = text
			}
		}
		// Flush the final run; without this the last run of every page
		// would never be printed.
		if pending {
			emit(run)
		}
	}
	return "", nil
}

Demo

Run example