Skip to content

Commit

Permalink
Merge pull request #16 from aozalevsky/cleanup-and-refactor
Browse files Browse the repository at this point in the history
Cleanup and refactor
  • Loading branch information
AntounMichael authored Nov 11, 2023
2 parents 8e4c4f6 + 2ff4a7a commit ccab9f8
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 199 deletions.
37 changes: 36 additions & 1 deletion VectorDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,4 +302,39 @@ def get_embeddings_for_pub(self, id):
texts.append(fragment.content)
embeddings.append(fragment.vector)
text_embeddings = list(zip(texts, embeddings))
return text_embeddings
return text_embeddings

# Class to represent a publication with attributes id, title, pmc, pubmed, and doi
class Publication:
    """Plain record describing one publication.

    Stores the identifier, title and three reference links exactly as they
    are passed to the constructor; no validation or normalisation is done.
    """

    # Class-level defaults. Every instance overwrites all of them in
    # __init__, so they are only visible when read from the class itself.
    id = ""
    title = ""
    pmc = ""
    pubmed = ""
    doi = ""

    def __init__(self, id, title, pmc, pubmed, doi):
        """Store the given values on the instance unchanged.

        Args:
            id: Unique identifier for the publication (a DOI, according to
                the original inline comment).
            title: Title of the publication.
            pmc: PubMed Central (PMC) link.
            pubmed: PubMed link.
            doi: Digital Object Identifier (DOI) link.
        """
        self.id = id
        self.title = title
        self.pmc = pmc
        self.pubmed = pubmed
        self.doi = doi

    def __repr__(self):
        """Return a constructor-style representation for logs and debugging."""
        return (
            f"{type(self).__name__}(id={self.id!r}, title={self.title!r}, "
            f"pmc={self.pmc!r}, pubmed={self.pubmed!r}, doi={self.doi!r})"
        )

# Class to represent a fragment of a publication with attributes id, header, content, and vector
class Fragment:
    """Plain record for one fragment of a publication.

    Stores an identifier, a header, the fragment text and its vector exactly
    as they are passed to the constructor; no validation is done.
    """

    # Class-level defaults. Every instance overwrites all of them in
    # __init__, so they are only visible when read from the class itself.
    id = ""
    header = ""
    content = ""
    # NOTE(review): the default is an empty string even though instances are
    # expected to carry a vector here -- confirm nothing relies on the type.
    vector = ""

    def __init__(self, id, header, content, vector):
        """Store the given values on the instance unchanged.

        Args:
            id: Identifier for the fragment (a DOI, according to the original
                inline comment; presumably the parent publication's id --
                TODO confirm against callers).
            header: Header or title of the fragment.
            content: Content or text of the fragment.
            vector: Vector representation of the fragment.
        """
        self.id = id
        self.header = header
        self.content = content
        self.vector = vector

    def __repr__(self):
        """Return a short representation for logs and debugging.

        The content and the vector itself are left out to keep the output
        short; only the vector's length is shown ("?" when it has none).
        """
        size = len(self.vector) if hasattr(self.vector, "__len__") else "?"
        return (
            f"{type(self).__name__}(id={self.id!r}, header={self.header!r}, "
            f"vector_len={size})"
        )
4 changes: 4 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"Emails": [],
"DEBUG": false
}
125 changes: 0 additions & 125 deletions context_retrieve.py

This file was deleted.

34 changes: 0 additions & 34 deletions database_entities.py

This file was deleted.

39 changes: 6 additions & 33 deletions analysis.py → document_analysis.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,18 @@

from VectorDatabase import Lantern
from database_entities import Publication, Fragment
from VectorDatabase import Lantern, Publication, Fragment
from google_sheets import SheetsApiClient

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from datetime import date


class DocumentAnalyzer:
"""Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
and reports the results to the spreadsheet
"""

# Maps a method label to the list of keywords/phrases associated with it.
# Fix: the 'HDX' list was missing a comma after
# 'Hydrogen/deuterium exchange mass spectrometry', so Python's implicit
# string-literal concatenation merged it with 'HDX' into a single bogus
# keyword and the bare 'HDX' keyword was lost.
keywords_groups = {
    'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'],
    'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry', 'HDX', 'HDXMS', 'HDX-MS'],
    'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"],
    'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"],
    'AFM': ['AFM', "atomic force microscopy"],
    'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"],
    '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"],
    'Y2H': ['Y2H', "yeast two-hybrid"],
    'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"],
    'XRAY_TOMOGRAPHY': ["soft x-ray tomography"],
    'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"],
    'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"],
    'EVOLUTION': ['coevolution', "evolutionary covariance"],
    'PREDICTED': ["predicted contacts"],
    'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"],
    'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension'],
}
"""

def __init__(self):
# self.lantern = Lantern()
Expand All @@ -57,7 +39,8 @@ def process_publications(self, publications: [Publication]):
else:
#print('paper not about cryo-em')
pass
rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""])
# add date if it's added
rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""])

self.update_spreadsheet(rows, hits)

Expand Down Expand Up @@ -129,22 +112,12 @@ def paper_about_cryoem(text_embeddings: []):
"""
return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings)

@staticmethod
def methods_string():
    """Render every keyword group as one ' or '-separated string.

    Each entry of ``DocumentAnalyzer.keywords_groups`` becomes
    ``"<label> (<kw1>, <kw2>, ...)"``; entries appear in the dictionary's
    iteration order.

    Returns:
        The joined string, or an empty string when there are no groups.
    """
    # str.join replaces the manual index check and repeated `+=`.
    return ' or '.join(
        f'{label} ({", ".join(keywords)})'
        for label, keywords in DocumentAnalyzer.keywords_groups.items()
    )


class LlmHandler:
"""pulled this straight from the hackathon code, should work though
"""

def __init__(self):
self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100)
self.llm=ChatOpenAI(
temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3
)
Expand Down
4 changes: 1 addition & 3 deletions hackathon_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
from paperscraper.pdf import save_pdf
from paperscraper.get_dumps import biorxiv

from fragment import Fragment
from publication import Publication
from VectorDatabase import Lantern
from VectorDatabase import Lantern, Fragment, Publication
import openai
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
Expand Down
4 changes: 1 addition & 3 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from fragment import Fragment
from publication import Publication
from VectorDatabase import Latern
from VectorDatabase import Lantern, Fragment, Publication
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import torch
Expand Down

0 comments on commit ccab9f8

Please sign in to comment.