Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix getEncoder when both ToUnicode and Encoding keys are present #1

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
336 changes: 178 additions & 158 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func (r *Reader) GetPlainText() (io.Reader, error) {
pages := r.NumPage()
var buf bytes.Buffer
fonts := make(map[string]*Font)
for i := 1; i < pages; i++ {
for i := 1; i <= pages; i++ {
p := r.Page(i)
for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap
if _, ok := fonts[name]; !ok {
Expand Down Expand Up @@ -170,6 +170,10 @@ func (f *Font) Encoder() TextEncoding {
}

func (f Font) getEncoder() TextEncoding {
if !f.V.Key("ToUnicode").IsNull() {
return f.charmapEncoding()
}

enc := f.V.Key("Encoding")
switch enc.Kind() {
case Name:
Expand Down Expand Up @@ -532,7 +536,6 @@ func (p Page) GetPlainText(fonts map[string]*Font) (io.Reader, error) {

// Content returns the page's content.
func (p Page) Content() (Content, error) {
strm := p.V.Key("Contents")
var enc TextEncoding = &nopEncoder{}

var g = gstate{
Expand Down Expand Up @@ -565,185 +568,202 @@ func (p Page) Content() (Content, error) {

var rect []Rect
var gstack []gstate
err := Interpret(strm, func(stk *Stack, op string) error {
n := stk.Len()
args := make([]Value, n)
for i := n - 1; i >= 0; i-- {
args[i] = stk.Pop()
}
switch op {
default:
//fmt.Println(op, args)
return nil

case "cm": // update g.CTM
if len(args) != 6 {
return errors.New("bad g.Tm")
}
var m matrix
for i := 0; i < 6; i++ {
m[i/2][i%2] = args[i].Float64()
}
m[2][2] = 1
g.CTM = m.mul(g.CTM)

case "gs": // set parameters from graphics state resource
gs := p.Resources().Key("ExtGState").Key(args[0].Name())
font := gs.Key("Font")
if font.Kind() == Array && font.Len() == 2 {
//fmt.Println("FONT", font)
}

case "f": // fill
case "g": // setgray
case "l": // lineto
case "m": // moveto
var strms []Value
contents := p.V.Key("Contents")
switch contents.Kind() {
case Stream:
strms = append(strms, contents)
case Array:
for i := 0; i < contents.Len(); i++ {
strms = append(strms, contents.Index(i))
}
default:
return Content{}, errors.New("expected page contents to be a stream or an array")
}

case "cs": // set colorspace non-stroking
case "scn": // set color non-stroking
for _, strm := range strms {
err := Interpret(strm, func(stk *Stack, op string) error {
n := stk.Len()
args := make([]Value, n)
for i := n - 1; i >= 0; i-- {
args[i] = stk.Pop()
}
switch op {
default:
//fmt.Println(op, args)
return nil

case "cm": // update g.CTM
if len(args) != 6 {
return errors.New("bad g.Tm")
}
var m matrix
for i := 0; i < 6; i++ {
m[i/2][i%2] = args[i].Float64()
}
m[2][2] = 1
g.CTM = m.mul(g.CTM)

case "gs": // set parameters from graphics state resource
gs := p.Resources().Key("ExtGState").Key(args[0].Name())
font := gs.Key("Font")
if font.Kind() == Array && font.Len() == 2 {
//fmt.Println("FONT", font)
}

case "re": // append rectangle to path
if len(args) != 4 {
return errors.New("bad re")
}
x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64()
rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}})
case "f": // fill
case "g": // setgray
case "l": // lineto
case "m": // moveto

case "q": // save graphics state
gstack = append(gstack, g)
case "cs": // set colorspace non-stroking
case "scn": // set color non-stroking

case "Q": // restore graphics state
n := len(gstack) - 1
g = gstack[n]
gstack = gstack[:n]
case "re": // append rectangle to path
if len(args) != 4 {
return errors.New("bad re")
}
x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64()
rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}})

case "BT": // begin text (reset text matrix and line matrix)
g.Tm = ident
g.Tlm = g.Tm
case "q": // save graphics state
gstack = append(gstack, g)

case "ET": // end text
case "Q": // restore graphics state
n := len(gstack) - 1
g = gstack[n]
gstack = gstack[:n]

case "T*": // move to start of next line
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm
case "BT": // begin text (reset text matrix and line matrix)
g.Tm = ident
g.Tlm = g.Tm

case "Tc": // set character spacing
if len(args) != 1 {
return errors.New("bad g.Tc")
}
g.Tc = args[0].Float64()
case "ET": // end text

case "TD": // move text position and set leading
if len(args) != 2 {
return errors.New("bad Td")
}
g.Tl = -args[1].Float64()
fallthrough
case "Td": // move text position
if len(args) != 2 {
return errors.New("bad Td")
}
tx := args[0].Float64()
ty := args[1].Float64()
x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm
case "T*": // move to start of next line
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm

case "Tf": // set text font and size
if len(args) != 2 {
return errors.New("bad TL")
}
f := args[0].Name()
g.Tf = p.Font(f)
enc = g.Tf.Encoder()
if enc == nil {
println("no cmap for", f)
enc = &nopEncoder{}
}
g.Tfs = args[1].Float64()
case "Tc": // set character spacing
if len(args) != 1 {
return errors.New("bad g.Tc")
}
g.Tc = args[0].Float64()

case "\"": // set spacing, move to next line, and show text
if len(args) != 3 {
return errors.New("bad \" operator")
}
g.Tw = args[0].Float64()
g.Tc = args[1].Float64()
args = args[2:]
fallthrough
case "'": // move to next line and show text
if len(args) != 1 {
return errors.New("bad ' operator")
}
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm
fallthrough
case "Tj": // show text
if len(args) != 1 {
return errors.New("bad Tj operator")
}
showText(args[0].RawString())
case "TD": // move text position and set leading
if len(args) != 2 {
return errors.New("bad Td")
}
g.Tl = -args[1].Float64()
fallthrough
case "Td": // move text position
if len(args) != 2 {
return errors.New("bad Td")
}
tx := args[0].Float64()
ty := args[1].Float64()
x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm

case "Tf": // set text font and size
if len(args) != 2 {
return errors.New("bad TL")
}
f := args[0].Name()
g.Tf = p.Font(f)
enc = g.Tf.Encoder()
if enc == nil {
println("no cmap for", f)
enc = &nopEncoder{}
}
g.Tfs = args[1].Float64()

case "TJ": // show text, allowing individual glyph positioning
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
showText(x.RawString())
} else {
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
case "\"": // set spacing, move to next line, and show text
if len(args) != 3 {
return errors.New("bad \" operator")
}
g.Tw = args[0].Float64()
g.Tc = args[1].Float64()
args = args[2:]
fallthrough
case "'": // move to next line and show text
if len(args) != 1 {
return errors.New("bad ' operator")
}
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
g.Tlm = x.mul(g.Tlm)
g.Tm = g.Tlm
fallthrough
case "Tj": // show text
if len(args) != 1 {
return errors.New("bad Tj operator")
}
showText(args[0].RawString())

case "TJ": // show text, allowing individual glyph positioning
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
showText(x.RawString())
} else {
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
}
}
}

case "TL": // set text leading
if len(args) != 1 {
return errors.New("bad TL")
}
g.Tl = args[0].Float64()
case "TL": // set text leading
if len(args) != 1 {
return errors.New("bad TL")
}
g.Tl = args[0].Float64()

case "Tm": // set text matrix and line matrix
if len(args) != 6 {
return errors.New("bad g.Tm")
}
var m matrix
for i := 0; i < 6; i++ {
m[i/2][i%2] = args[i].Float64()
}
m[2][2] = 1
g.Tm = m
g.Tlm = m
case "Tm": // set text matrix and line matrix
if len(args) != 6 {
return errors.New("bad g.Tm")
}
var m matrix
for i := 0; i < 6; i++ {
m[i/2][i%2] = args[i].Float64()
}
m[2][2] = 1
g.Tm = m
g.Tlm = m

case "Tr": // set text rendering mode
if len(args) != 1 {
return errors.New("bad Tr")
}
g.Tmode = int(args[0].Int64())
case "Tr": // set text rendering mode
if len(args) != 1 {
return errors.New("bad Tr")
}
g.Tmode = int(args[0].Int64())

case "Ts": // set text rise
if len(args) != 1 {
return errors.New("bad Ts")
}
g.Trise = args[0].Float64()
case "Ts": // set text rise
if len(args) != 1 {
return errors.New("bad Ts")
}
g.Trise = args[0].Float64()

case "Tw": // set word spacing
if len(args) != 1 {
return errors.New("bad g.Tw")
}
g.Tw = args[0].Float64()
case "Tw": // set word spacing
if len(args) != 1 {
return errors.New("bad g.Tw")
}
g.Tw = args[0].Float64()

case "Tz": // set horizontal text scaling
if len(args) != 1 {
return errors.New("bad Tz")
case "Tz": // set horizontal text scaling
if len(args) != 1 {
return errors.New("bad Tz")
}
g.Th = args[0].Float64() / 100
}
g.Th = args[0].Float64() / 100
return nil
})
if err != nil {
return Content{}, err
}
return nil
})
if err != nil {
return Content{}, err
}

return Content{text, rect}, nil
}

Expand Down