diff --git a/page.go b/page.go index e6cc3ad..a271d1b 100644 --- a/page.go +++ b/page.go @@ -64,7 +64,7 @@ func (r *Reader) GetPlainText() (io.Reader, error) { pages := r.NumPage() var buf bytes.Buffer fonts := make(map[string]*Font) - for i := 1; i < pages; i++ { + for i := 1; i <= pages; i++ { p := r.Page(i) for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap if _, ok := fonts[name]; !ok { @@ -170,6 +170,10 @@ func (f *Font) Encoder() TextEncoding { } func (f Font) getEncoder() TextEncoding { + if !f.V.Key("ToUnicode").IsNull() { + return f.charmapEncoding() + } + enc := f.V.Key("Encoding") switch enc.Kind() { case Name: @@ -532,7 +536,6 @@ func (p Page) GetPlainText(fonts map[string]*Font) (io.Reader, error) { // Content returns the page's content. func (p Page) Content() (Content, error) { - strm := p.V.Key("Contents") var enc TextEncoding = &nopEncoder{} var g = gstate{ @@ -565,185 +568,202 @@ func (p Page) Content() (Content, error) { var rect []Rect var gstack []gstate - err := Interpret(strm, func(stk *Stack, op string) error { - n := stk.Len() - args := make([]Value, n) - for i := n - 1; i >= 0; i-- { - args[i] = stk.Pop() - } - switch op { - default: - //fmt.Println(op, args) - return nil - - case "cm": // update g.CTM - if len(args) != 6 { - return errors.New("bad g.Tm") - } - var m matrix - for i := 0; i < 6; i++ { - m[i/2][i%2] = args[i].Float64() - } - m[2][2] = 1 - g.CTM = m.mul(g.CTM) - - case "gs": // set parameters from graphics state resource - gs := p.Resources().Key("ExtGState").Key(args[0].Name()) - font := gs.Key("Font") - if font.Kind() == Array && font.Len() == 2 { - //fmt.Println("FONT", font) - } - case "f": // fill - case "g": // setgray - case "l": // lineto - case "m": // moveto + var strms []Value + contents := p.V.Key("Contents") + switch contents.Kind() { + case Stream: + strms = append(strms, contents) + case Array: + for i := 0; i < contents.Len(); i++ { + strms = append(strms, contents.Index(i)) + } + default: + return Content{}, errors.New("expected page contents to be a stream or an array") + } - case "cs": // set colorspace non-stroking - case "scn": // set color non-stroking + for _, strm := range strms { + err := Interpret(strm, func(stk *Stack, op string) error { + n := stk.Len() + args := make([]Value, n) + for i := n - 1; i >= 0; i-- { + args[i] = stk.Pop() + } + switch op { + default: + //fmt.Println(op, args) + return nil + + case "cm": // update g.CTM + if len(args) != 6 { + return errors.New("bad g.Tm") + } + var m matrix + for i := 0; i < 6; i++ { + m[i/2][i%2] = args[i].Float64() + } + m[2][2] = 1 + g.CTM = m.mul(g.CTM) + + case "gs": // set parameters from graphics state resource + gs := p.Resources().Key("ExtGState").Key(args[0].Name()) + font := gs.Key("Font") + if font.Kind() == Array && font.Len() == 2 { + //fmt.Println("FONT", font) + } - case "re": // append rectangle to path - if len(args) != 4 { - return errors.New("bad re") - } - x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64() - rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}}) + case "f": // fill + case "g": // setgray + case "l": // lineto + case "m": // moveto - case "q": // save graphics state - gstack = append(gstack, g) + case "cs": // set colorspace non-stroking + case "scn": // set color non-stroking - case "Q": // restore graphics state - n := len(gstack) - 1 - g = gstack[n] - gstack = gstack[:n] + case "re": // append rectangle to path + if len(args) != 4 { + return errors.New("bad re") + } + x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64() + rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}}) - case "BT": // begin text (reset text matrix and line matrix) - g.Tm = ident - g.Tlm = g.Tm + case "q": // save graphics state + gstack = append(gstack, g) - case "ET": // end text + case "Q": // restore graphics state + n := len(gstack) - 1 + g = gstack[n] + gstack = gstack[:n] - case "T*": // move to start of next line - x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} - g.Tlm = x.mul(g.Tlm) - g.Tm = g.Tlm + case "BT": // begin text (reset text matrix and line matrix) + g.Tm = ident + g.Tlm = g.Tm - case "Tc": // set character spacing - if len(args) != 1 { - return errors.New("bad g.Tc") - } - g.Tc = args[0].Float64() + case "ET": // end text - case "TD": // move text position and set leading - if len(args) != 2 { - return errors.New("bad Td") - } - g.Tl = -args[1].Float64() - fallthrough - case "Td": // move text position - if len(args) != 2 { - return errors.New("bad Td") - } - tx := args[0].Float64() - ty := args[1].Float64() - x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}} - g.Tlm = x.mul(g.Tlm) - g.Tm = g.Tlm + case "T*": // move to start of next line + x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} + g.Tlm = x.mul(g.Tlm) + g.Tm = g.Tlm - case "Tf": // set text font and size - if len(args) != 2 { - return errors.New("bad TL") - } - f := args[0].Name() - g.Tf = p.Font(f) - enc = g.Tf.Encoder() - if enc == nil { - println("no cmap for", f) - enc = &nopEncoder{} - } - g.Tfs = args[1].Float64() + case "Tc": // set character spacing + if len(args) != 1 { + return errors.New("bad g.Tc") + } + g.Tc = args[0].Float64() - case "\"": // set spacing, move to next line, and show text - if len(args) != 3 { - return errors.New("bad \" operator") - } - g.Tw = args[0].Float64() - g.Tc = args[1].Float64() - args = args[2:] - fallthrough - case "'": // move to next line and show text - if len(args) != 1 { - return errors.New("bad ' operator") - } - x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} - g.Tlm = x.mul(g.Tlm) - g.Tm = g.Tlm - fallthrough - case "Tj": // show text - if len(args) != 1 { - return errors.New("bad Tj operator") - } - showText(args[0].RawString()) + case "TD": // move text position and set leading + if len(args) != 2 { + return errors.New("bad Td") + } + g.Tl = -args[1].Float64() + fallthrough + case "Td": // move text position + if len(args) != 2 { + return errors.New("bad Td") + } + tx := args[0].Float64() + ty := args[1].Float64() + x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}} + g.Tlm = x.mul(g.Tlm) + g.Tm = g.Tlm + + case "Tf": // set text font and size + if len(args) != 2 { + return errors.New("bad TL") + } + f := args[0].Name() + g.Tf = p.Font(f) + enc = g.Tf.Encoder() + if enc == nil { + println("no cmap for", f) + enc = &nopEncoder{} + } + g.Tfs = args[1].Float64() - case "TJ": // show text, allowing individual glyph positioning - v := args[0] - for i := 0; i < v.Len(); i++ { - x := v.Index(i) - if x.Kind() == String { - showText(x.RawString()) - } else { - tx := -x.Float64() / 1000 * g.Tfs * g.Th - g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) + case "\"": // set spacing, move to next line, and show text + if len(args) != 3 { + return errors.New("bad \" operator") + } + g.Tw = args[0].Float64() + g.Tc = args[1].Float64() + args = args[2:] + fallthrough + case "'": // move to next line and show text + if len(args) != 1 { + return errors.New("bad ' operator") + } + x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} + g.Tlm = x.mul(g.Tlm) + g.Tm = g.Tlm + fallthrough + case "Tj": // show text + if len(args) != 1 { + return errors.New("bad Tj operator") + } + showText(args[0].RawString()) + + case "TJ": // show text, allowing individual glyph positioning + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + showText(x.RawString()) + } else { + tx := -x.Float64() / 1000 * g.Tfs * g.Th + g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) + } } - } - case "TL": // set text leading - if len(args) != 1 { - return errors.New("bad TL") - } - g.Tl = args[0].Float64() + case "TL": // set text leading + if len(args) != 1 { + return errors.New("bad TL") + } + g.Tl = args[0].Float64() - case "Tm": // set text matrix and line matrix - if len(args) != 6 { - return errors.New("bad g.Tm") - } - var m matrix - for i := 0; i < 6; i++ { - m[i/2][i%2] = args[i].Float64() - } - m[2][2] = 1 - g.Tm = m - g.Tlm = m + case "Tm": // set text matrix and line matrix + if len(args) != 6 { + return errors.New("bad g.Tm") + } + var m matrix + for i := 0; i < 6; i++ { + m[i/2][i%2] = args[i].Float64() + } + m[2][2] = 1 + g.Tm = m + g.Tlm = m - case "Tr": // set text rendering mode - if len(args) != 1 { - return errors.New("bad Tr") - } - g.Tmode = int(args[0].Int64()) + case "Tr": // set text rendering mode + if len(args) != 1 { + return errors.New("bad Tr") + } + g.Tmode = int(args[0].Int64()) - case "Ts": // set text rise - if len(args) != 1 { - return errors.New("bad Ts") - } - g.Trise = args[0].Float64() + case "Ts": // set text rise + if len(args) != 1 { + return errors.New("bad Ts") + } + g.Trise = args[0].Float64() - case "Tw": // set word spacing - if len(args) != 1 { - return errors.New("bad g.Tw") - } - g.Tw = args[0].Float64() + case "Tw": // set word spacing + if len(args) != 1 { + return errors.New("bad g.Tw") + } + g.Tw = args[0].Float64() - case "Tz": // set horizontal text scaling - if len(args) != 1 { - return errors.New("bad Tz") + case "Tz": // set horizontal text scaling + if len(args) != 1 { + return errors.New("bad Tz") + } + g.Th = args[0].Float64() / 100 } - g.Th = args[0].Float64() / 100 + return nil + }) + if err != nil { + return Content{}, err } - return nil - }) - if err != nil { - return Content{}, err } + return Content{text, rect}, nil }