Merge pull request #16 from aozalevsky/cleanup-and-refactor

Cleanup and refactor
aozalevsky · Nov 11, 2023 · ccab9f8 · ccab9f8
2 parents 8e4c4f6 + 2ff4a7a
commit ccab9f8
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 199 deletions.
diff --git a/VectorDatabase.py b/VectorDatabase.py
@@ -302,4 +302,39 @@ def get_embeddings_for_pub(self, id):
             texts.append(fragment.content)
             embeddings.append(fragment.vector)
         text_embeddings = list(zip(texts, embeddings))
-        return text_embeddings
+        return text_embeddings
+
+# Publication record: stores the id, title, pmc, pubmed, and doi values passed to the constructor
+class Publication:
+    # Class-level defaults (empty strings); __init__ overwrites every one of them on the instance
+    id = ""
+    title = ""
+    pmc = ""
+    pubmed = ""
+    doi = ""
+
+    def __init__(self, id, title, pmc, pubmed, doi):
+        self.id = id          # (DOI) Unique identifier for the publication
+        self.title = title    # Title of the publication
+        self.pmc = pmc        # PubMed Central (PMC) Link
+        self.pubmed = pubmed  # PubMed Link
+        self.doi = doi        # Digital Object Identifier (DOI) Link for the publication
+
+# Fragment of a publication: stores the id, header, content, and vector values passed to the constructor
+class Fragment:
+
+    # Class-level defaults (empty strings); __init__ overwrites every one of them on the instance
+    # NOTE(review): vector defaults to "" although it is described below as a vector representation - confirm the intended default
+    id = ""        
+    header = ""    
+    content = ""   
+    vector = ""    
+
+    def __init__(self, id, header, content, vector):
+        # Constructor: performs no validation or conversion of its arguments
+
+        # Each argument is stored unchanged under the attribute of the same name
+        self.id = id          # (DOI) Unique identifier for the fragment
+        self.header = header  # Header or title of the fragment
+        self.content = content # Content or text of the fragment
+        self.vector = vector  # Vector representation of the fragment
diff --git a/config.json b/config.json
@@ -0,0 +1,4 @@
+{
+    "Emails": [],
+    "DEBUG": false
+}
diff --git a/context_retrieve.py b/context_retrieve.py
diff --git a/database_entities.py b/database_entities.py
diff --git a/analysis.py b/document_analysis.py
@@ -1,36 +1,18 @@
 
-from VectorDatabase import Lantern
-from database_entities import Publication, Fragment
+from VectorDatabase import Lantern, Publication, Fragment
 from google_sheets import SheetsApiClient
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chat_models import ChatOpenAI
 from langchain.chains import RetrievalQA
 from langchain import PromptTemplate
+from datetime import date
+
 
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
     and reports the results to the spreadsheet
-    """
-
-    keywords_groups = {
-        'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'],
-        'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'],
-        'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"],
-        'FRET': ['FRET',  "forster resonance energy transfer", "fluorescence resonance energy transfer"],
-        'AFM': ['AFM',  "atomic force microscopy" ],
-        'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"],
-        '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"],
-        'Y2H': ['Y2H', "yeast two-hybrid"],
-        'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"],
-        'XRAY_TOMOGRAPHY': ["soft x-ray tomography"],
-        'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"],
-        'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"],
-        'EVOLUTION': ['coevolution', "evolutionary covariance"],
-        'PREDICTED': ["predicted contacts"],
-        'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"],
-        'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension']
-    }    
+    """  
 
     def __init__(self):
         # self.lantern = Lantern()
@@ -57,7 +39,8 @@ def process_publications(self, publications: [Publication]):
             else:
                 #print('paper not about cryo-em')
                 pass
-            rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""])
+            # add date if it's added 
+            rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""])
 
         self.update_spreadsheet(rows, hits)
 
@@ -129,22 +112,12 @@ def paper_about_cryoem(text_embeddings: []):
         """
         return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings)
 
-    @staticmethod
-    def methods_string():
-        methods_string = ''
-        for i, (k, v) in enumerate(DocumentAnalyzer.keywords_groups.items()):
-            if i > 0:
-                methods_string += ' or '
-            methods_string += f'{k} ({", ".join(v)})'
-        return methods_string
-
 
 class LlmHandler:
     """pulled this straight from the hackathon code, should work though
     """
 
     def __init__(self):
-        self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100)
         self.llm=ChatOpenAI(
                 temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3
             )

diff --git a/hackathon_runner.py b/hackathon_runner.py
@@ -5,9 +5,7 @@
 from paperscraper.pdf import save_pdf
 from paperscraper.get_dumps import biorxiv
 
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Lantern
+from VectorDatabase import Lantern, Fragment, Publication
 import openai
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.embeddings.openai import OpenAIEmbeddings

diff --git a/tests/test.py b/tests/test.py
@@ -1,6 +1,4 @@
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Latern
+from VectorDatabase import Lantern, Fragment, Publication
 from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer
 import torch