forked from Stirling-Tools/Stirling-PDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DockerfileBase
50 lines (36 loc) · 1.37 KB
/
DockerfileBase
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Main stage
FROM ubuntu:latest AS base
# JDK for app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
openjdk-17-jre
# Doc conversion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core \
libreoffice-common \
libreoffice-writer \
libreoffice-calc \
libreoffice-impress \
python3-uno \
curl \
unoconv
# OCR MY PDF (unpaper for descew and other advanced featues)
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2 && \
add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr && \
apt-get update && \
apt-get install -y --no-install-recommends \
ghostscript \
python3-pip \
ocrmypdf \
unpaper && \
pip install --upgrade pip && \
pip install --no-cache-dir --upgrade ocrmypdf && \
pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1
#CV and HTML
RUN pip install --no-cache-dir opencv-python-headless WeasyPrint
# cleanup and etc
RUN rm -rf /var/lib/apt/lists/* && \
mkdir /usr/share/tesseract-ocr-original && \
cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \
rm -rf /usr/share/tesseract-ocr