# pop_notes

#!/usr/bin/python3.5
# -*- coding: utf-8 -*-

## requires /usr/bin/pip3 install python-poppler-qt5
## For mac: sudo port install poppler-qt5

import popplerqt5
import os, sys, urllib, re
import PyQt5


# Category labels attached to each annotation; the trailing comment names the
# highlight colour that conventionally stands for the category.
SUMM = "Summary"              # Purple
DNU = "Did not understand"    # Red
NOTE = "Note"                 # Yellow
QUOTE = "Quote"               # Green
CITE = "New Source"           # Blue

# Lookup table from an annotation colour's red-channel value to its category
# label.  Several red values can map to the same label.
color_dict = {
    red: label
    for label, reds in (
        (SUMM, (226, 230)),    # Purple
        (DNU, (253,)),         # Red/Pink
        (NOTE, (248, 255)),    # Yellow
        (QUOTE, (178, 188)),   # Green
        (CITE, (164, 165)),    # Blue
    )
    for red in reds
}


def _highlight_text(page, annotation, pwidth, pheight):
    """Return the page text covered by a highlight annotation.

    Each highlight quad is given in page-relative coordinates, so its corners
    are scaled by the page size before asking the page for the text inside
    the resulting rectangle.  Every non-empty piece is de-hyphenated, joined
    onto one line, has ``$`` and ``#`` backslash-escaped, and is followed by
    a single space.
    """
    pieces = []
    for quad in annotation.highlightQuads():
        bdy = PyQt5.QtCore.QRectF()
        bdy.setCoords(quad.points[0].x() * pwidth, quad.points[0].y() * pheight,
                      quad.points[2].x() * pwidth, quad.points[2].y() * pheight)

        txt = page.text(bdy).strip()
        if not txt:
            continue

        # Drop a trailing hyphen so a word split across quads is rejoined.
        if txt[-1] == "-":
            txt = txt[:-1].strip()
        txt = txt.replace("-\n", "").replace("\n", " ")
        txt = txt.replace("$", "\\$").replace("#", "\\#")
        pieces.append(txt + " ")

    return "".join(pieces)


def _write_section(out, annotations, context, header, note_fmt):
    """Write one titled section holding every annotation of ``context``.

    Nothing is written when no annotation carries that context.  ``note_fmt``
    is the format string used for the annotation's note; the highlighted text
    is always written as a ``- {text}`` bullet.
    """
    entries = [v for v in annotations if v["context"] == context]
    if not entries:
        return

    out.write(header)
    for v in entries:
        if v["note"]:
            out.write(note_fmt.format(**v))
        if v["text"]:
            out.write("- {text}\n\n".format(**v))
    out.write("\n\n")


def main(filename, text, jtex, basepage=1, quotes=False, silent=False):
    """Extract the annotations of a PDF into a notes file.

    Parameters:
        filename: path of the PDF to read; it is printed first.
        text:     prefix (normally a directory ending in "/") under which the
                  ``.txt`` notes file is written.
        jtex:     prefix for a second copy of the notes whose title line is
                  replaced by an ``\\lrsection`` command; skipped when empty.
        basepage: page number reported for the first page of the PDF.
        quotes:   when true, also extract the text under each highlight.
        silent:   when false, echo the finished notes file to stdout.

    Annotations whose colour is not in ``color_dict`` are skipped; unless the
    colour is pure black, a diagnostic line is printed for them.
    """
    print(filename)

    # Both names are derived exactly once and reused for every open() below.
    stem = filename.replace(".pdf", "").split("/")[-1]
    notes_path = text + filename.replace(".pdf", ".txt").split("/")[-1]

    doc = popplerqt5.Poppler.Document.load(filename)

    annotations = []

    for i in range(doc.numPages()):

        #########
        ## Now use the popplerqt5 interface to get the highlights....
        ## following http://stackoverflow.com/questions/21050551/
        ## and https://people.freedesktop.org/~aacid/docs/qt5/classPoppler_1_1Annotation.html

        page = doc.page(i)
        size = page.pageSize()
        pwidth, pheight = size.width(), size.height()

        for annotation in page.annotations():

            # Only text notes (1) and highlights (4) are of interest:
            # popplerqt5.Poppler.TextAnnotation
            # popplerqt5.Poppler.HighlightAnnotation
            if annotation.subType() not in [1, 4]:
                continue

            color = annotation.style().color()
            rgb = (color.red(), color.green(), color.blue())

            # The category is keyed on the red channel alone.
            context = color_dict.get(rgb[0])
            if context is None:
                # Pure black is skipped silently; anything else is reported.
                if rgb != (0, 0, 0):
                    print("Do not have color (r, g, b) = ({}, {}, {})".format(*rgb))
                    # Show the annotation's note: the highlighted text has
                    # not been extracted at this point and would be empty.
                    print("page", basepage + i, "  >>  ", annotation.contents())
                continue

            note = {
                "page": basepage + i,
                "note": annotation.contents(),
                "text": "",
                "context": context,
            }

            # Retrieving the highlight is a bit more finicky,
            # since PDFs aren't really designed for this.
            if quotes and isinstance(annotation, popplerqt5.Poppler.HighlightAnnotation):
                note["text"] = _highlight_text(page, annotation, pwidth, pheight)

            annotations.append(note)

    with open(notes_path, "w") as out:

        out.write(">* Notes for " + stem + "\n\n")

        _write_section(out, annotations, SUMM,
                       ">>>* Summary Impressions :: \\\\ \n", "{note}\n")
        _write_section(out, annotations, CITE,
                       ">>>* Additional sources to check out :: \\\\ \n", "{note}\n\n")
        _write_section(out, annotations, DNU,
                       ">>>* Did not understand :: \\\\ \n", "{note}\n")

        noted = [v for v in annotations if v["context"] in (NOTE, QUOTE)]
        if noted:
            out.write(">>>* Notes and Quotes :: \n")
            for v in noted:
                s = "-[{page:>4d}]".format(**v)
                if v["note"]:
                    s += "{note}".format(**v)
                if v["text"]:
                    s += "\n\\begin{{lquote}}\n{text}\n\\end{{lquote}}\n".format(**v)
                s += "\n\n"
                out.write(s)

    if jtex:
        # The jtex copy is named after the last two path components of the PDF.
        tail = filename.replace(".pdf", "").split("/")[-2:]
        section = " :: ".join(tail).replace("_", "\\_")
        heading = "\\lrsection{{{}}}{{{}}}\n".format(stem, section)

        with open(jtex + "/".join(tail), "w") as out, open(notes_path, "r") as notes:
            for line in notes:
                if ">* Notes for" in line:
                    line = heading
                out.write(line)

    if not silent:
        with open(notes_path, "r") as notes:
            for line in notes:
                # Strip only the newline, never a real trailing character.
                print(line.rstrip("\n"))


if __name__ == "__main__":

    import argparse

    # Command-line front end: one positional PDF path plus optional switches.
    # --pages supplies main()'s basepage; --text and --jtex are the output
    # prefixes for the notes file and its jtex copy.
    cli = argparse.ArgumentParser()
    cli.add_argument("filename")
    cli.add_argument("-p", "--pages", default=1, type=int)
    cli.add_argument("-s", "--silent", action="store_true")
    cli.add_argument("-q", "--quotes", action="store_true")
    cli.add_argument("-t", "--text", default="notes/", type=str)
    cli.add_argument("-j", "--jtex", default="jtex/", type=str)
    opts = cli.parse_args()

    main(
        filename=opts.filename,
        text=opts.text,
        jtex=opts.jtex,
        basepage=opts.pages,
        quotes=opts.quotes,
        silent=opts.silent,
    )