Merge branch 'master' into documentatation

normcontrol · May 23, 2023 · b80b3cc · b80b3cc
2 parents 5453e16 + 47b478c
commit b80b3cc
Show file tree

Hide file tree

Showing 96 changed files with 4,803 additions and 2,051 deletions.
diff --git a/.github/workflows/tests-app.yml b/.github/workflows/tests-app.yml
@@ -0,0 +1,32 @@
+name: Run tests
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: pip install -r requirements.txt
+    - name: Install pytest
+      run: |
+          python -m pip install --upgrade pip
+          pip install pytest
+    - name: Test docx
+      run: pytest tests/docx/test*
+    - name: Codecov
+      uses: codecov/codecov-action@v3
diff --git a/.pep8speaks.yml b/.pep8speaks.yml
diff --git a/README.md b/README.md
@@ -120,4 +120,4 @@ Your contacts. For example:
 [Andrei Berezhkov](https://github.com/a-berezhkov)
 
 [Galina Larionova](https://github.com/orgs/normcontrol/people/galinalar)
-
+
diff --git a/examples/pdf_example/PDFParserExample.py b/examples/pdf_example/PDFParserExample.py
@@ -0,0 +1,26 @@
+from src.PDF.PDFParser import PDFParser
+from os import walk
+
+for dir_path, dir_names, file_names in walk('.\\documents'):
+    for filename in file_names:
+        '''
+        Declare an object of the PDFParser class, in the initialization parameter,
+        which will indicate the path to the pdf file
+        '''
+        pdf_parser = PDFParser(path=dir_path + '\\' + filename)
+        lines = pdf_parser.lines
+        spaces = pdf_parser.line_spaces
+        tables = pdf_parser.list_of_table
+        list_of_picture = pdf_parser.pictures
+        '''
+        Using the get_elements method, we get a file of the UnifiedDocumentView type, 
+        which contains data about the entire text document and its structural elements
+        '''
+        document = pdf_parser.get_all_elements(lines, spaces, tables, list_of_picture)
+        # To write information about structural elements, use the write_CSV method, specifying the save path
+        # document.write_CSV(dir_path + '\\csv\\' + filename + '.csv')
+        '''
+        To create a JSON string from data about structural elements, which will later be sent to the classifier,
+        use the create_json_to_clasifier method, which takes a list of required fields as parameters
+        '''
+        json = document.create_json()
diff --git a/examples/pdf_example/documents/Отчёт по практике для парсинга.pdf b/examples/pdf_example/documents/Отчёт по практике для парсинга.pdf
diff --git a/requirements.txt b/requirements.txt
@@ -1,21 +1,31 @@
-certifi==2022.12.7
+certifi==2023.5.7
 cffi==1.15.1
-charset-normalizer==3.0.0
+charset-normalizer==2.0.0
 click==8.1.3
-cryptography==38.0.1
+cryptography==40.0.2
 defusedxml==0.7.1
-Flask==2.2.2
-guppy3==3.1.2
+guppy3
 idna==3.4
 itsdangerous==2.1.2
 Jinja2==3.1.2
 MarkupSafe==2.1.1
 odfpy==1.4.1
-pdfminer.six==20220524
-pdfplumber==0.7.5
-Pillow==9.2.0
+pdfplumber~=0.9.0
+Pillow
 pycparser==2.21
-requests==2.28.1
-urllib3==1.26.13
-Wand==0.6.10
-Werkzeug==2.2.2
+requests==2.30.0
+urllib3==2.0.2
+Wand==0.6.11
+Werkzeug
+lxmlx==2.0.2
+python-docx==0.8.11
+bestconfig==1.3.6
+fastapi~=0.95.1
+uvicorn~=0.22.0
+starlette~=0.26.1
+dacite==1.8.0
+tabulate
+tabula-py
+pydantic~=1.10.7
+lxml~=4.9.2
+tabula-py
diff --git a/src/Flask.py b/src/Flask.py