Implemented a script to extract orgchart data from gazettes #26
Open
LakinduOshadha wants to merge 14 commits into LSFLK:master from LakinduOshadha:master
Changes from all commits
14 commits
43b7edb  Initiated orgchart data extraction (LakinduOshadha)
3b21789  Merge branch 'master' of https://github.com/LSFLK/GIG-Scripts (LakinduOshadha)
5775844  Extract PDFs directly from the website (LakinduOshadha)
d9fd8bd  Generalized to extract data from the amendment tables (LakinduOshadha)
a694982  Improved accuracy (LakinduOshadha)
821a1de  initiated an approach to extract data from gazette amendments (LakinduOshadha)
e747238  Updated extracted data (LakinduOshadha)
c3d76ce  Create README.md (LakinduOshadha)
6a4c79b  Updated update_extracted_data.py (LakinduOshadha)
e8b895c  Merge branch 'master' of https://github.com/LakinduOshadha/GIG-Scripts (LakinduOshadha)
9a9302a  Script to extract orgchart data from the gazettes (LakinduOshadha)
646667d  Updated the name (LakinduOshadha)
c161be7  Updated orgchart data (LakinduOshadha)
94580d0  Added docstring (LakinduOshadha)
@@ -1,12 +1,14 @@
package GIG_Scripts
//import "github.com/lsflk/gig-sdk/client"

import "github.com/lsflk/gig-sdk/client"

/**
Set the GIG server API url here for crawlers
*/
//var GigClient = client.GigClient{
//	ApiUrl: "http://localhost:9000/api/",
//	ApiKey: "$2a$12$dcKw7SVbheBUwWSupp1Pze7zOHJBlcgW2vuQGSEh0QVHC/KUeRgwW",
//	NerServerUrl: "http://localhost:8080/classify",
//	NormalizationServerUrl: "http://localhost:9000/api/",
//	OcrServerUrl: "http://localhost:8081/extract?url=",
//}
*/
var GigClient = client.GigClient{
	ApiUrl:                 "<ApiUrl>",
	ApiKey:                 "<ApiKey>",
	NerServerUrl:           "<NerServerUrl>",
	NormalizationServerUrl: "<NormalizationServerUrl>",
	OcrServerUrl:           "<OcrServerUrl>",
}
@@ -0,0 +1,3 @@
venv
.idea
helpers/__pycache__/
@@ -0,0 +1,39 @@

# extract_orgchart_data.py

How to run:

pip install -r requirements.txt
python extract_orgchart_data.py

extract_orgchart_data.py extracts data (ministers and their corresponding departments) from gazette PDFs in which the data is presented in tabular format. The script downloads all of the English-version PDFs directly from the cabinet office website and extracts data from those gazettes.

The script works as follows:
1. Download gazette PDFs from the cabinetoffice website.
2. Convert them to docx.
3. Iterate through the tables and extract each ministry and its departments (listed in Column II of the tables).
4. Save the extracted data in scripts/orgchart CSV format.

Limitations:

- The script is rule-based, so it can only extract data from gazette PDFs whose tables follow the expected structure.
- Because conversion from PDF to docx can introduce formatting issues in the tables (there is no ideal way to extract PDF content with exactly the same formatting), some data is missed during extraction. Overall, the script extracts all of the ministries and most of the departments, but some departments are still missing.

Improvements:

- The gazettes on the cabinetoffice website contain two types of data:
  1. Data in tabular format, listing a ministry and the departments under it in a table. Ex: gazette 2022-07-22
  2. Data in sentences, describing amendments to the previous gazette (e.g. these departments should be removed from this ministry, and those should be added under that one). Ex: gazette 2022-10-05
- This script can only identify the data in tabular format. Gazettes in which the data is written as sentences still need to be addressed.
- When a PDF is converted into a docx, some tables are mis-formatted. Because of this, the script cannot identify "Column II", which contains the departments, in some tables, and therefore misses some departments. This needs to be fixed.

NOTE: Check the extracted data before populating the GIG db.
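The closing note above asks the user to verify the extracted data before loading it into the GIG db. A minimal spot check along those lines might look like the sketch below; the directory name is taken from CSV_DIRECTORY in the main script further down, while the CSV layout itself is not shown in this PR, so the check only counts rows per file.

import csv
import glob
import os

# Assumed output directory, matching CSV_DIRECTORY in extract_orgchart_data.py
CSV_DIRECTORY = "extracted"

for csv_path in sorted(glob.glob(os.path.join(CSV_DIRECTORY, "*.csv"))):
    with open(csv_path, newline="", encoding="utf-8") as f:
        rows = list(csv.reader(f))
    # Print a quick summary so obviously empty or truncated extracts stand out
    print(f"{os.path.basename(csv_path)}: {len(rows)} rows")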
@@ -0,0 +1,29 @@
import os
from helpers.extract_ministers_departments import extract_ministers_departments
from helpers.write_to_csv import write_to_csv
from helpers.crawl_pdfs import download_all_pdfs
from helpers.get_pdf_names import get_pdf_names


WEBSITE_URL = "http://www.cabinetoffice.gov.lk/cab/index.php?option=com_content&view=article&id=54&Itemid=93&lang=en"
PDF_DIRECTORY = "pdfs"
CSV_DIRECTORY = "extracted"

if __name__ == "__main__":
    try:
        download_all_pdfs(WEBSITE_URL, PDF_DIRECTORY)
        pdf_file_names = get_pdf_names(PDF_DIRECTORY)

        for pdf_file_name in pdf_file_names:
            # extract ministers and corresponding departments
            pdf_location = os.path.join(os.getcwd(), PDF_DIRECTORY, pdf_file_name)
            try:
                extracted_data = extract_ministers_departments(pdf_location)
                # writing to csv
                write_to_csv(extracted_data, pdf_file_name, CSV_DIRECTORY)
                extracted_data.clear()
            except Exception as e:
                print(f"Error processing PDF '{pdf_file_name}': {str(e)}")

    except Exception as e:
        print(f"Error occurred during PDF download: {str(e)}")
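The write_to_csv helper imported above is not included in the visible part of this diff. As a rough illustration only, a helper matching the call write_to_csv(extracted_data, pdf_file_name, CSV_DIRECTORY), where extracted_data maps each ministry to a list of departments, might look like the following sketch; the column layout and output file naming are assumptions, not taken from this PR.

import csv
import os

def write_to_csv(extracted_data, pdf_file_name, csv_directory):
    """Hypothetical sketch: write {ministry: [departments]} pairs to a CSV file."""
    os.makedirs(csv_directory, exist_ok=True)
    # Assumed naming scheme: one CSV per gazette PDF
    csv_path = os.path.join(csv_directory, os.path.splitext(pdf_file_name)[0] + ".csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["ministry", "department"])  # assumed header row
        for ministry, departments in extracted_data.items():
            for department in departments:
                writer.writerow([ministry, department])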
@@ -0,0 +1 @@
python extract_orgchart_data.py
@@ -0,0 +1,77 @@
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import os
import requests

def download_pdf(url, save_directory):
    """
    Downloads a PDF file from the given URL and saves it in the specified directory.

    Args:
        url (str): The URL of the PDF file to download.
        save_directory (str): The directory where the downloaded PDF file will be saved.
    """
    response = requests.get(url)
    file_name = os.path.join(save_directory, url.split("/")[-1])

    try:
        response.raise_for_status()  # Raises an exception for non-200 status codes
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"PDF downloaded successfully from {url}")

    except requests.HTTPError as e:
        print(f"Error downloading PDF from {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while downloading PDF from {url}: {e}")
    finally:
        response.close()


def get_pdf_links(url):
    """
    Retrieves the links to PDF files from the given URL.

    Args:
        url (str): The URL from which to retrieve the PDF links.

    Returns:
        list: A list of PDF links.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pdf_links = []
    language = "E"
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.endswith(str(language + '.pdf')):
            pdf_links.append(href)
    return pdf_links

def download_all_pdfs(url, save_directory):
    """
    Downloads all the PDF files from the given URL and saves them in the specified directory.

    Args:
        url (str): The URL from which to download the PDFs.
        save_directory (str): The directory where the downloaded PDFs will be saved.
    """
    save_directory = os.path.join(os.getcwd(), save_directory)

    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    domain_name = urlparse(url).scheme + "://" + urlparse(url).netloc

    pdf_links = get_pdf_links(url)
    print(f"Found {len(pdf_links)} PDFs to download.")

    for link in pdf_links:
        pdf_url = link if link.startswith('http') else domain_name + link
        print(f"Downloading {pdf_url}...")
        try:
            download_pdf(pdf_url, save_directory)
        except Exception as e:
            print(f"Error downloading {pdf_url}: {e}")

    print("All PDFs downloaded successfully!")
extract_orgchart_data/helpers/extract_ministers_departments.py (209 additions, 0 deletions)
@@ -0,0 +1,209 @@
import re
import os
from helpers.extract_pdf_text import extract_pdf_text


COLUMN_HEADING = "Column I"
NO_OF_COLUMNS_IN_TABLE = 3
extracted_data = dict()

def extract_ministers_departments(pdf_file):
    """
    Extracts the ministers and corresponding departments from the given PDF file.

    Args:
        pdf_file (str): The path to the PDF file.

    Returns:
        dict: A dictionary containing the extracted ministers and departments.
    """

    pdf_text = extract_pdf_text(pdf_file).body

    print("Extracting ministers and departments...")
    # iterate through the pdf_text lists
    for i, data in enumerate(pdf_text):
        for table_data in data:
            # getting headings list in pdf_text
            table_heading = table_data[0]

            # extract ministers if table_heading list contains "Column I"
            if search_in_sublists(table_heading, COLUMN_HEADING):
                extract_ministers(pdf_text, i)

            extract_departments(table_data)

    print("No: of Ministers Found : ", len(extracted_data))
    for key, value in extracted_data.items():
        print('Ministry :\t\t', key)
        print('No. of Departments :\t', len(value))
        print('Departments :\t\t', value, '\n\n')

    print("Ministers and departments extracted successfully! PDF file: ", os.path.basename(pdf_file))
    return extracted_data


def is_department_cell(deparment_string):
    """
    Checks if the given cell is a department cell.

    Args:
        deparment_string (str): The cell to check.

    Returns:
        bool: True if the deparment_string is a department cell, False if it is an unwanted cell.
    """

    if "Column II" in deparment_string:
        return False
    if "Departments, Statutory \nInstitutions & Public Corporations" in deparment_string:
        return False
    if "Departments, Statutory Institutions and Public Corporations" in deparment_string:
        return False
    if len(deparment_string) == 0:
        return False
    return True


def extract_ministers(pdf_text, i):
    """
    Extracts the ministers from the PDF text.

    Args:
        pdf_text (list): The PDF text as a nested list.
        i (int): The index of the current data in the pdf_text list.

    Returns:
        None
    """

    # getting list containing ministers and merging
    minister_data = pdf_text[i-1][0][0][-1]
    minister_data = ''.join(minister_data)

    # check whether the minister_data is valid
    minister_len = len(minister_data)
    minimum_len = 10
    if minister_len < minimum_len: return

    # check whether the minister_data contains a number
    temp = re.findall(r'\d+', minister_data)
    no_lst_in_minister_str = list(map(int, temp))

    # search for the minister number in minister_data
    if len(no_lst_in_minister_str) > 0:
        minister_name = clean_minister_data(minister_data)

        if minister_name not in extracted_data:
            extracted_data[minister_name] = []
    return


def extract_departments(table_data):
    """
    Extracts the departments from the table data.

    Args:
        table_data (list): The table data to extract departments from.

    Returns:
        None
    """

    # find the list containing 3 columns in the table
    if len(table_data) == NO_OF_COLUMNS_IN_TABLE:
        # getting the 2nd column data to extract "Column II"
        deparment_string = ''.join(table_data[1])

        # checking whether it is the department cell
        if is_department_cell(deparment_string):
            # clean department names and add the list to extracted_data
            department_lst = clean_department_data(deparment_string)

            try:
                minister_name = list(extracted_data.keys())[-1]
                extracted_data[minister_name] = extracted_data[minister_name] + department_lst
            except:
                print("No Ministry Found")


def clean_department_data(department_data):
    """
    Cleans the department data by removing unwanted characters and formatting the department names.

    Args:
        department_data (str): The department data to clean.

    Returns:
        list: A list of cleaned department names.
    """

    # Remove newlines and tabs
    data = department_data.replace('\n', '').replace('\t', '').replace('�', ' ')

    # Remove any non-printable characters
    data = ''.join(c for c in data if c.isprintable())

    # split the string by numbers and create a list
    lst = re.split('[0-9]+', data)
    for i, x in enumerate(lst):
        lst[i] = x.replace('. ', '')

    # remove empty strings and whitespace from list
    lst = [x.strip() for x in lst if x.strip()]

    # capitalize the first letter
    lst = [x.capitalize() for x in lst]

    return lst


def clean_minister_data(merged_str):
    """
    Cleans the merged minister data by removing unwanted characters and formatting the minister name.

    Args:
        merged_str (str): The merged minister data to clean.

    Returns:
        str: The cleaned minister name.
    """

    # Remove "SCHEDULE" and "(Contd.)"
    remove_text_lst = ["(Contd.)", "SCHEDULE"]
    for remove_text in remove_text_lst:
        merged_str = re.compile(re.escape(remove_text), re.IGNORECASE).sub('', merged_str)

    # remove unnecessary characters
    merged_str = merged_str.replace('.', '').replace('•', '').replace('/n', '').replace('/t', '')

    # Remove all digits
    merged_str = ''.join(c for c in merged_str if not c.isdigit())

    # capitalize the first letter
    merged_str = merged_str.capitalize()

    # remove trailing spaces
    return merged_str.strip()


def search_in_sublists(sublist, search_term):
    """
    Searches for a search term in the sublists of a given list.

    Args:
        sublist (list): The sublist to search in.
        search_term (str): The search term to look for.

    Returns:
        bool: True if the search term is found, False otherwise.
    """

    for item in sublist:
        if search_term == item.strip():
            return True
    return False
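The extract_pdf_text helper that this module depends on is not part of the visible diff. Per the README, the pipeline converts each PDF to docx and then walks the tables; a rough, assumption-laden sketch of such a helper follows (pdf2docx and python-docx are assumed choices, and the real helper evidently wraps the tables in a deeper nesting than this simplified version returns).

from types import SimpleNamespace
from pdf2docx import Converter
from docx import Document

def extract_pdf_text(pdf_file):
    """Hypothetical sketch: convert a gazette PDF to docx and collect its table cells."""
    docx_file = pdf_file.rsplit(".", 1)[0] + ".docx"

    # Convert the PDF to a docx document (assumed approach, following the README)
    converter = Converter(pdf_file)
    converter.convert(docx_file)
    converter.close()

    # Collect each table as a list of rows, each row a list of cell texts
    tables = []
    for table in Document(docx_file).tables:
        tables.append([[cell.text for cell in row.cells] for row in table.rows])

    # The real helper returns an object exposing a .body attribute; mimic that shape
    return SimpleNamespace(body=tables)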
Review comment: Add docstrings to provide code documentation for all the methods.