Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add database integration #6

Merged
merged 10 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ pip install -r requirements.txt
Run the Backend API on port 5000:

```bash
uvicorn main:app --reload --port 5000
uvicorn index.api.routes:app --reload --port 5000
```

### Run the Backend via Docker
Expand Down
75 changes: 74 additions & 1 deletion index/api/routes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import json
import logging
from typing import Dict

from fastapi import FastAPI
from fastapi import FastAPI, HTTPException
from starlette.middleware.cors import CORSMiddleware
from starlette.responses import RedirectResponse

from index.db.model import Terminology, Concept, Mapping
from index.repository.sqllite import SQLLiteRepository
from index.embedding import MPNetAdapter

app = FastAPI(
title="INDEX",
description="Intelligent data steward toolbox using Large Language Model embeddings "
Expand Down Expand Up @@ -31,6 +37,8 @@
)

# Module-level singletons shared by every route handler in this file.
# Logger is looked up under the name "uvicorn.info".
logger = logging.getLogger("uvicorn.info")
# Persistence layer; handlers call repository.store(...) and query through
# repository.session.
repository = SQLLiteRepository()
# Text-embedding backend; handlers call embedding_model.get_embedding(text).
embedding_model = MPNetAdapter()


@app.get("/", include_in_schema=False)
Expand All @@ -41,3 +49,68 @@ def swagger_redirect():
@app.get("/version", tags=["info"])
def get_current_version():
    """Return the version string configured on the FastAPI application."""
    current_version = app.version
    return current_version


@app.put("/terminologies/{id}", tags=["terminologies"])
async def create_or_update_terminology(id: str, name: str):
    """Build a ``Terminology`` from the request and hand it to the repository.

    Args:
        id: Terminology identifier, taken from the URL path.
        name: Human-readable terminology name.

    Returns:
        A dict with a single ``"message"`` key confirming the operation.

    Raises:
        HTTPException: status 400 when constructing or storing the
            terminology raises any exception; the original error text is
            included in ``detail``.
    """
    try:
        repository.store(Terminology(name=name, id=id))
    except Exception as error:
        failure_detail = f"Failed to create or update terminology: {str(error)}"
        raise HTTPException(status_code=400, detail=failure_detail)
    return {"message": f"Terminology {id} created or updated successfully"}


@app.put("/concepts/{id}", tags=["concepts"])
async def create_or_update_concept(id: str, terminology_id: str, name: str):
    """Store a ``Concept`` that belongs to an existing terminology.

    Args:
        id: Concept identifier, taken from the URL path.
        terminology_id: Identifier of the terminology the concept belongs to.
        name: Human-readable concept name.

    Returns:
        A dict with a single ``"message"`` key confirming the operation.

    Raises:
        HTTPException: status 404 when no terminology with
            ``terminology_id`` exists; status 400 when the lookup or the
            store operation fails for any other reason.
    """
    try:
        terminology = (
            repository.session.query(Terminology)
            .filter(Terminology.id == terminology_id)
            .first()
        )
        if terminology is None:
            raise HTTPException(
                status_code=404,
                detail=f"Terminology with id {terminology_id} not found",
            )

        concept = Concept(terminology=terminology, name=name, id=id)
        repository.store(concept)
        return {"message": f"Concept {id} created or updated successfully"}
    except HTTPException:
        # HTTPException is itself an Exception; without this clause the 404
        # above would be swallowed by the generic handler and reported as 400.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to create or update concept: {str(e)}",
        ) from e


@app.put("/mappings/{id}", tags=["mappings"])
async def create_or_update_mapping(id: str, concept_id: str, text: str):
    """Embed ``text`` and store it as a ``Mapping`` of an existing concept.

    Args:
        id: Mapping identifier from the URL path. It is only echoed in the
            response message; it is not passed to ``Mapping``.
            NOTE(review): ``Mapping`` declares an auto-incrementing integer
            primary key, so this value does not select the stored row —
            confirm whether that is intended.
        concept_id: Identifier of the concept the text maps to.
        text: Free text to embed and store.

    Returns:
        A dict with a single ``"message"`` key confirming the operation.

    Raises:
        HTTPException: status 404 when no concept with ``concept_id``
            exists; status 400 when lookup, embedding, or storing fails for
            any other reason.
    """
    try:
        concept = (
            repository.session.query(Concept)
            .filter(Concept.id == concept_id)
            .first()
        )
        if concept is None:
            raise HTTPException(
                status_code=404,
                detail=f"Concept with id {concept_id} not found",
            )

        # Convert the embedding to a plain list. Mapping.__init__ serialises
        # it to JSON itself, so it must NOT be pre-encoded with json.dumps
        # here — doing so stores a JSON string inside a JSON string and
        # Mapping.embedding would then yield a str instead of a list.
        embedding_list = embedding_model.get_embedding(text).tolist()
        mapping = Mapping(concept=concept, text=text, embedding=embedding_list)
        repository.store(mapping)
        return {"message": f"Mapping {id} created or updated successfully"}
    except HTTPException:
        # Let the 404 above reach the client instead of being rewrapped
        # as a 400 by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to create or update mapping: {str(e)}",
        ) from e


@app.post("/mappings", tags=["mappings"])
async def get_closest_mappings_for_text(text: str):
    """Return the stored mappings closest to ``text``.

    The text is embedded, the repository is asked for the closest mappings
    and their similarities, and each pair is flattened into a plain dict.

    Args:
        text: Free text to embed and search with.

    Returns:
        A list of dicts, one per mapping returned by the repository, each
        holding the mapped ``concept`` (with its nested ``terminology``),
        the mapping ``text`` and the ``similarity`` paired with it.
    """
    # The former debug print of the full embedding vector was dropped: it
    # wrote the whole vector to stdout on every request.
    embedding = embedding_model.get_embedding(text).tolist()
    closest_mappings, similarities = repository.get_closest_mappings(embedding)
    # NOTE(review): similarity is passed through untouched; presumably it is
    # already a JSON-serialisable number — confirm against the repository.
    return [
        {
            "concept": {
                "id": mapping.concept.id,
                "name": mapping.concept.name,
                "terminology": {
                    "id": mapping.concept.terminology.id,
                    "name": mapping.concept.terminology.name,
                },
            },
            "text": mapping.text,
            "similarity": similarity,
        }
        for mapping, similarity in zip(closest_mappings, similarities)
    ]
76 changes: 38 additions & 38 deletions index/db/model.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,51 @@
import pandas as pd
import json

import numpy as np
from sqlalchemy import Column, ForeignKey, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

class Terminology:

def __int__(self, identifier: str, name: str):
self.identifier = identifier
self.name = name


class Concept:

def __init__(self, identifier: str, terminology: Terminology):
self.identifier = identifier
self.terminology = terminology
Base = declarative_base()


class Embedding:
class Terminology(Base):
__tablename__ = 'terminology'
id = Column(String, primary_key=True)
name = Column(String)

def __init__(self, embedding: [float], source: str):
self.embedding = embedding
self.source = source

def to_dataframe(self):
return pd.DataFrame(self.embedding, columns=[self.source])
def __init__(self, name: str, id: str):
self.name = name
self.id = id


class Variable:
class Concept(Base):
__tablename__ = 'concept'
id = Column(String, primary_key=True)
name = Column(String)
terminology_id = Column(String, ForeignKey('terminology.id'))
terminology = relationship("Terminology")

def __init__(self, name: str, description: str, source: str, embedding: Embedding = None):
def __init__(self, terminology: Terminology, name: str, id: str):
self.terminology = terminology
self.name = name
self.description = description
self.source = source
self.embedding = embedding
self.id = id


class Mapping:
class Mapping(Base):
__tablename__ = 'mapping'
id = Column(Integer, primary_key=True, autoincrement=True) # Auto-incrementing primary key
concept_id = Column(String, ForeignKey('concept.id'))
concept = relationship("Concept")
text = Column(Text)
embedding_json = Column(Text)

def __init__(self, concept: Concept, variable: Variable, source: str):
def __init__(self, concept: Concept, text: str, embedding: list):
self.concept = concept
self.variable = variable
self.source = source

def __eq__(self, other):
return self.concept.identifier == other.concept.identifier and self.variable.name == other.variable.name

def __hash__(self):
return hash((self.concept.identifier, self.variable.name))

def __str__(self):
return f"{self.variable.name} ({self.variable.description}) -> {self.concept.identifier}"
self.text = text
if isinstance(embedding, np.ndarray):
embedding = embedding.tolist()
self.embedding_json = json.dumps(embedding) # Store embedding as JSON

@property
def embedding(self):
return json.loads(self.embedding_json)
Loading
Loading