From b5493b86d33f6adbbb1d45b0bcb6d14a6d78251c Mon Sep 17 00:00:00 2001 From: Remi Gau Date: Wed, 26 Jun 2024 22:01:05 +0200 Subject: [PATCH 1/5] clean up --- .gitignore | 3 +++ _version.py | 16 ---------------- pyproject.toml | 2 +- utils/__pycache__/bids_split.cpython-312.pyc | Bin 2434 -> 0 bytes utils/__pycache__/pdf_split.cpython-312.pyc | Bin 2016 -> 0 bytes 5 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 _version.py delete mode 100644 utils/__pycache__/bids_split.cpython-312.pyc delete mode 100644 utils/__pycache__/pdf_split.cpython-312.pyc diff --git a/.gitignore b/.gitignore index 607cacd..c4b9df4 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ volumes/* + + +__pycache__ \ No newline at end of file diff --git a/_version.py b/_version.py deleted file mode 100644 index 3e85520..0000000 --- a/_version.py +++ /dev/null @@ -1,16 +0,0 @@ -# file generated by setuptools_scm -# don't change, don't track in version control -TYPE_CHECKING = False -if TYPE_CHECKING: - from typing import Tuple, Union - VERSION_TUPLE = Tuple[Union[int, str], ...] -else: - VERSION_TUPLE = object - -version: str -__version__: str -__version_tuple__: VERSION_TUPLE -version_tuple: VERSION_TUPLE - -__version__ = version = '0.1.dev31+g470f6ce.d20240621' -__version_tuple__ = version_tuple = (0, 1, 'dev31', 'g470f6ce.d20240621') diff --git a/pyproject.toml b/pyproject.toml index 1461d11..78ec715 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ packages = ["LoxLM"] source = "vcs" [tool.hatch.build.hooks.vcs] -version-file = "_version.py" +version-file = "src/_version.py" [tool.pytest.ini_options] addopts = "-ra -vv" diff --git a/utils/__pycache__/bids_split.cpython-312.pyc b/utils/__pycache__/bids_split.cpython-312.pyc deleted file mode 100644 index 941c95312b5663b4dbdea69587749084e456731d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2434 zcmb7GO-vhC5T3U`{sRiB7u)Cuu7pS7Dv^kVCk;)PE)Fz5lwHGXsU~Q9APmy|X;Sd$|(0RLFn?&iM5Ba^B zH#6_e%=>1R-)d@nfa5`9eDJIb;19+)LnslOeMroM0t!h(oUL4%jB;^~Fq%i2kMjgL zI0=d{4T^Y&vn}JoBni8oG3Iobm^(jG<#DQ|F00YuBu&b?N(a;{djD8Dr6Y|RjUOiI zMI}3tIi)6*7p1a&q&JR> z5EmysVXs;JZc5Rdo2NqEkqgN0)5cUeHJFl!cgda7K58&~ zYZx%%b?C~9XYx+wb1uEgkr;v@-sKz#o>TS<X_D(eVy{-t5Kw)x?4gp&~F*~9vW zt>@AbdN!f4*GyzHVb1i)!{eEY2`x3Hnl(0^y{yu7att4U$LvyuJ<3vJR*fpQDFVUt z%QQKWMt<9CP!_#eqa$D{inaD_N>&~pRWrI~x~vT?>^22WO%KuW>UaWuLkeX3xQqR)J zxSmRD7I18jy2mC>Pa=`ZU;+|T_1h8F?QFA~w2yJR2E_nK!5LxJQ)us<5gvM4pEUKX zevo+B^eN*)1u3*H?afPjm-?3jYf|5g_esm)Um90c`mjY~8jXcO<9eVgALv>S^yLG6 zD-COb=#21r)g~Sfav3eiuott?_a~xh9)=*t>t(!+l&Y0kIm0O9E(YiC1AKzkIUy(J zT!t$K*WEHV1TuF%fV*zPJyJ%jV-MIX9K>j3w%%}m!OeyQ$hlA-PdI;uS?Vks)-Z&d zSh3*PMDGFnd-2L&T_k zrKqv?az2GOe3uEv<}rL2TNTD8=6p==e3;46haDA<9I6zjZ+o1%s{e?hs^Zh^9sy<_ zpdWlc=AGa#=bnIe*bf?+K>i#M{F}Xdg+B)qq@>}SfUt0Sitip(wn$`)bYt0ZSvSVt-6et$4>B9O#0euZUHhp$^v6`c-DTJ%39~Zl-Bu3RUHGyWc zx+y3rs+r<~kO{!md4!rPzY9r2mk+om)BcC=$Ifuqwr7 zj^k7awQK-ibBfFfPwE>M1{O{%wk^gMdsiFZT&v$x1a8MEvUI7?)cketm%TUp=3)h@ zWnJpXOC5^?>zzIM&YtzoWBJZwKS^tyCmu>CpNq`Gwkz`N;GzdQx*q!vEku_NF3aB^ zy%AmYADW5bR0uY009Uhnjz?-e7|91C%Wp65S|Ka#EA`8@Yr$i4LZPlW`D>Lgm_jHe4Vb*51JU+zLcnT|Wvq{ak-I+UdTx z(;MCIy4NM(e7{qSc2~*e%13oZTb-d13H&a>2(B-Weh3rn^YnET;KDGVF~e5t@l4;4 zTI$K#RPcp2tO&3sz8}%G zq}T!b;5!664%N=4w3NT_NM2TZ9{j;OcxRH>Il0b*^Tm!7z#$)O^r{@L}~0kvcK&!3t9 z-~9dm&Hp_T2_qO{cV_$xg3w=l5e!&+W9KAH%g8_mwor!8f`#)!M!;MbEiq3r1S0`m zKn7VrhV(%29W&A#R^=CbF`{C(_2S&c^!bmAS;M5TJp8Gtmnh54m_w6Un$>NSUNXP7 zN2jfv4Z7}Q__7X&cKTqwj7*fl2FeH_#D9il;02r!4PgOgh#`WM=7=i8g~oUTF8CS_ z$aN9^J1ZcTk%Jwif#wefP4TnWhJznqf2&MvDL8ZlNBj{7-wP-WPe73atL8ivYU+XJ zkN_F{8_07=6-HFS4eOJo!ex!+=1n)^>%|$9TG?s%K#@@$ENFJt(x#1ZM)`-i5gXE| zv3efe{hp>7MZJ_a3pR6QZ-c3lOPFbmQ_cXYaKWQShNLe8EneU&=*VS4n~gjq0Y)-KXi$qSDal~4 z+XA<+4?tW;TMBBASIDAL>*!k6ZfdKe8y#vT{H*)*Z;3UNKJI1}@>ogKVu{UI|9Y%{ zGj?)4cJfhTBbKg^Z(DZ&1nB`=_=015rCx;cEu#rk7VLT<5t`a-$*o~D3!O$|-y`%T zc$dX8DNBx&MmJ<#n1CC`Vz4Va@>HW>zBiXfgDMO!b~y4)VX>2-vIO>kFm{}K+B`0A zIO1)1P>y^XeJ2J)D7QGme^RAosx|Oy&JXHp;EmDZ34$B~(JJ+5p|EAe5FKR(LrjXJ zKuoO=6Dcd)ZcOFpK-owWHPGtuDX+vA`k(khdtw+MXz>ojA&f%4F#OB%Y@jK#NF^il zWDsbwi&MLJP;GIAB6G=X*1GH}X2D>yz%I&5=gPom*l~#ky@T=$sA0MvCZD=dZbN9X zXuHJ7QRYhHIm-krKDjb?GN~)VkxOhdKg~GdJyv^y9^gnEbw0M2oy}*(FJOOyxQ=Qa zabRQbk$dqw@oL-MH@A@z>R;d2T}dw`SEO3sz|;2O6{BkZRK8z+^!bxi`qrhj_TkDf zh+1d&HWDJom&miOWR_#-qEWKAzE9Jx zl(Lq8LJxweS8SRD?LP7VeG2e?O}XH2n(^xw5G22|r*^ov)_Y{9r9~RX+Xq{v#7+WB zhkQ?NugWwIlbf`%g$aE!n=5E~F`q9%%bObv`mW~lkolDEY?{8(t^F46MM2?F?g?*K mIpcrQS$Y%>crIBdh%FIg{09m>M`xa+p?^ph22z0V!uuC0-OwEX From caa06ed9000b0e5816530e4f1550eb2e77064a4f Mon Sep 17 00:00:00 2001 From: Remi Gau Date: Wed, 26 Jun 2024 22:20:26 +0200 Subject: [PATCH 2/5] update gitignore --- .gitignore | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c4b9df4..cbc1f2b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,166 @@ volumes/* +src/_version.py -__pycache__ \ No newline at end of file +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ From 8f79d6d1ded0169cbbbe99752659bcf654f749fa Mon Sep 17 00:00:00 2001 From: Remi Gau Date: Wed, 26 Jun 2024 22:25:58 +0200 Subject: [PATCH 3/5] minor fixes with precommit --- docker-compose.yml | 37 +++++++++++++++-------------- src/utils/bids_split.py | 2 +- src/utils/example_loader.py | 6 ++--- src/utils/multi_example_selector.py | 10 ++++---- src/utils/openneuro_extractor.py | 6 ++--- 5 files changed, 31 insertions(+), 30 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4f109fc..0d38e84 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,4 @@ +--- version: '3.5' services: @@ -5,15 +6,15 @@ services: container_name: milvus-etcd image: quay.io/coreos/etcd:v3.5.5 environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd healthcheck: - test: ["CMD", "etcdctl", "endpoint", "health"] + test: [CMD, etcdctl, endpoint, health] interval: 30s timeout: 20s retries: 3 @@ -25,13 +26,13 @@ services: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin ports: - - "9001:9001" - - "9000:9000" + - 9001:9001 + - 9000:9000 volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data command: minio server /minio_data --console-address ":9001" healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + test: [CMD, curl, -f, http://localhost:9000/minio/health/live] interval: 30s timeout: 20s retries: 3 @@ -39,7 +40,7 @@ services: standalone: container_name: milvus-standalone image: milvusdb/milvus:v2.4.1 - command: ["milvus", "run", "standalone"] + command: [milvus, run, standalone] security_opt: - seccomp:unconfined environment: @@ -47,20 +48,20 @@ services: MINIO_ADDRESS: minio:9000 APIFY_API_KEY: apify_api_qUQEtSba0coNeeNI8awoz8HnDUph7k2CHSGO volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + test: [CMD, curl, -f, http://localhost:9091/healthz] interval: 30s start_period: 90s timeout: 20s retries: 3 ports: - - "19530:19530" - - "9091:9091" + - 19530:19530 + - 9091:9091 depends_on: - - "etcd" - - "minio" + - etcd + - minio networks: default: - name: milvus \ No newline at end of file + name: milvus diff --git a/src/utils/bids_split.py b/src/utils/bids_split.py index 9f14171..e7c6930 100644 --- a/src/utils/bids_split.py +++ b/src/utils/bids_split.py @@ -60,4 +60,4 @@ def clean_text(self, text): def get_splits(self): return self.splits - + diff --git a/src/utils/example_loader.py b/src/utils/example_loader.py index b8aa388..83f6cbc 100644 --- a/src/utils/example_loader.py +++ b/src/utils/example_loader.py @@ -19,7 +19,7 @@ def __init__(self, test_split: float=.8, file: str | Path = "/home/rand/github/L manufacturer = e['Manufacturer'], model = e['ManufacturersModelName'], ) - for e + for e in examples_all ] if not (0 < test_split < 1): @@ -27,10 +27,10 @@ def __init__(self, test_split: float=.8, file: str | Path = "/home/rand/github/L ind = int(len(examples_all)*test_split) self.examples_test = examples_all[:ind] self.examples_store = examples_all[ind:] - + except FileNotFoundError: raise(FileNotFoundError("File Not Found")) - + def filter_types(self, types: list,): pass diff --git a/src/utils/multi_example_selector.py b/src/utils/multi_example_selector.py index 9020d91..d9417c7 100644 --- a/src/utils/multi_example_selector.py +++ b/src/utils/multi_example_selector.py @@ -13,7 +13,7 @@ class Example(BaseModel): index: str series_description: str protocol_name: str - task_name: str = float('nan') + task_name: str = float('nan') repetition_time: float = float('nan') echo_time: float = float('nan') inversion_time: float = float('nan') @@ -46,7 +46,7 @@ def eval_distance(self, value): def clean_examples(self, examples: list): return [example for example in examples if example is not None and example != "NA" and example is not float('nan')] - + class FloatExampleRanker(BaseExampleRanker): def __init__(self,examples: list): @@ -98,7 +98,7 @@ def eval_distance(self, example): def embed(self, examples): return self.model.encode(examples) - + class MultiExampleSelector(BaseExampleSelector): def __init__(self, examples: [Example], k: int = None, model = SentenceTransformer("BAAI/bge-large-en-v1.5")): @@ -142,6 +142,6 @@ def select_examples(self, input: Example, k: int =3): sorted_examples = sorted(example_dist, key =lambda x: x[1]) print(sorted_examples[:10]) return sorted_examples[:k] - - + + diff --git a/src/utils/openneuro_extractor.py b/src/utils/openneuro_extractor.py index fb88077..74b5ca6 100644 --- a/src/utils/openneuro_extractor.py +++ b/src/utils/openneuro_extractor.py @@ -58,13 +58,13 @@ def scan_dir(path): break except: print(f"Failed to load {root}/{file}") - + dirs = [dir for dir in dirs if "." not in dir] for dir in dirs: little_dic = scan_dir(f"{root}/{dir}") dic.update(little_dic) return dic - + # %% dic = {} @@ -77,6 +77,6 @@ def scan_dir(path): with open(f'descriptions_{start_index}-{end_index}', 'w') as f: json.dump(dic, f) - + From b68c9e3a46ab60bc9315893df09ddfc14cd2112a Mon Sep 17 00:00:00 2001 From: Remi Gau Date: Wed, 26 Jun 2024 22:57:19 +0200 Subject: [PATCH 4/5] misc --- .github/workflows/system_test.yml | 4 +- .github/workflows/test.yml | 12 +--- .pre-commit-config.yaml | 31 ++++----- pyproject.toml | 62 ++++++++++-------- rag.py | 19 +++--- src/loxlm/_version.py | 16 +++++ src/{ => loxlm}/context_retriever.py | 10 +-- src/{ => loxlm}/rag_input.py | 21 +++--- src/{ => loxlm}/utils/bids_split.py | 8 ++- src/{ => loxlm}/utils/example_loader.py | 7 +- src/{ => loxlm}/utils/extract_cleaner.py | 0 .../utils/multi_example_selector.py | 11 ++-- src/{ => loxlm}/utils/openneuro.tsv | 0 src/{ => loxlm}/utils/openneuro_extractor.py | 5 +- src/{ => loxlm}/utils/pdf_split.py | 2 + src/test/context_retriever.py | 63 ------------------ custom_selector.py => test/custom_selector.py | 10 +-- test/test_context_retriever.py | 65 +++++++++++++++++++ 18 files changed, 180 insertions(+), 166 deletions(-) create mode 100644 src/loxlm/_version.py rename src/{ => loxlm}/context_retriever.py (91%) rename src/{ => loxlm}/rag_input.py (92%) rename src/{ => loxlm}/utils/bids_split.py (94%) rename src/{ => loxlm}/utils/example_loader.py (96%) rename src/{ => loxlm}/utils/extract_cleaner.py (100%) rename src/{ => loxlm}/utils/multi_example_selector.py (97%) rename src/{ => loxlm}/utils/openneuro.tsv (100%) rename src/{ => loxlm}/utils/openneuro_extractor.py (99%) rename src/{ => loxlm}/utils/pdf_split.py (99%) delete mode 100644 src/test/context_retriever.py rename custom_selector.py => test/custom_selector.py (92%) create mode 100644 test/test_context_retriever.py diff --git a/.github/workflows/system_test.yml b/.github/workflows/system_test.yml index 16f168e..7c83690 100644 --- a/.github/workflows/system_test.yml +++ b/.github/workflows/system_test.yml @@ -27,7 +27,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install dev + - name: Install run: | python -m pip install --upgrade pip - pip install .[dev] + pip install .[test] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9339598..3c025ce 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,13 +30,5 @@ jobs: - name: Install run: | python -m pip install --upgrade pip - pip install -e .[tests] - - name: Run tests and coverage - run: make coverage - - name: Code coverage - uses: codecov/codecov-action@v3 - with: - file: coverage.xml - flags: ${{ matrix.os }}-${{ matrix.python-version }} - name: codecov - fail_ci_if_error: false + pip install -e .[test] + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3caa31d..dacfea7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,24 +26,12 @@ repos: hooks: - id: flynt -- repo: https://github.com/asottile/reorder-python-imports - rev: v3.10.0 - hooks: - - id: reorder-python-imports - args: [--py38-plus, --add-import, from __future__ import annotations] - -# - repo: https://github.com/pre-commit/mirrors-mypy -# rev: v1.3.0 -# hooks: -# - id: mypy -# additional_dependencies: [types-all] -# files: bidspm -# args: [--config-file, setup.cfg] - -- repo: https://github.com/MarcoGorelli/auto-walrus - rev: v0.2.2 - hooks: - - id: auto-walrus +# Sorts Python imports alphabetically and by section with `isort`. +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + args: [--profile, black, --settings-path, pyproject.toml] - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt rev: 0.2.3 @@ -51,6 +39,13 @@ repos: - id: yamlfmt args: [--mapping, '2', --sequence, '2', --offset, '0'] +# Format TOML files +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.13.0 + hooks: + - id: pretty-format-toml + args: [--autofix, --indent, '4'] + - repo: https://github.com/codespell-project/codespell rev: v2.2.5 hooks: diff --git a/pyproject.toml b/pyproject.toml index 78ec715..3cd8f97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,45 +1,55 @@ [build-system] -requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" +requires = ["hatchling", "hatch-vcs"] [project] -name = "LoxLM" +authors = [{name = "James Randolph", email = "jarandolph77@gmail.com"}] +dependencies = ["langchain", "langchain_core", "langchain_community" , "langchain_milvus", "sentence-transformers"] description = "LLM based DICOM to BIDS conversion aid." +dynamic = ["version"] +license = {file = "LICENSE.txt"} +name = "loxlm" readme = "README.md" requires-python = ">=3.8" -license = { file = "LICENSE.txt" } -authors = [{ name = "James Randolph", email = "jarandolph77@gmail.com" }] -dynamic = ["version"] -dependencies = ["langchain", "langchain_core", "langchain_community"] [project.optional-dependencies] doc = [ - "sphinx", - "sphinx-argparse", - "sphinx-copybutton", - "sphinx_rtd_theme", - "myst-parser", - "rstcheck", + "sphinx", + "sphinx-argparse", + "sphinx-copybutton", + "sphinx_rtd_theme", + "myst-parser", + "rstcheck" +] +# For running unit and docstring tests +test = [ + "coverage", + "pytest>=6.0.0", + "pytest-cov" ] -[tool.hatch.build.targets.wheel] -packages = ["LoxLM"] -[tool.hatch.version] -source = "vcs" +[tool.black] +line-length = 100 + +[tool.codespell] +skip = "./.git,.mypy_cache,env,venv,tests,*bval,*bvec" + [tool.hatch.build.hooks.vcs] -version-file = "src/_version.py" +version-file = "src/loxlm/_version.py" -[tool.pytest.ini_options] -addopts = "-ra -vv" +[tool.hatch.build.targets.wheel] +packages = ["src/loxlm"] -[tool.coverage.run] -branch = true -source = ["LoxLM/"] +[tool.hatch.version] +source = "vcs" -[tool.codespell] -skip = "./.git,.mypy_cache,env,venv,tests,*bval,*bvec" +[tool.pytest.ini_options] +addopts = "-ra -vv" -[tool.black] -line-length = 100 +[tool.isort] +combine_as_imports = true +line_length = 79 +profile = "black" +skip_gitignore = true diff --git a/rag.py b/rag.py index 095cff9..0cb8c50 100644 --- a/rag.py +++ b/rag.py @@ -1,22 +1,21 @@ -import os -import json +from __future__ import annotations -from langchain_community.vectorstores import Milvus as m - -from langchain_community.embeddings import HuggingFaceBgeEmbeddings -from langchain_community.llms import Ollama +import json +import os from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler - -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.runnables import RunnablePassthrough +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +from langchain_community.llms import Ollama +from langchain_community.vectorstores import Milvus as m from langchain_core.example_selectors import SemanticSimilarityExampleSelector +from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts.few_shot import FewShotPromptTemplate from langchain_core.prompts.prompt import PromptTemplate - +from langchain_core.runnables import RunnablePassthrough from utils.bids_split import BidsSplitter from utils.pdf_split import PdfSplitter + #Database Parameters print("Database Parameters") URI = 'http://localhost:19530' diff --git a/src/loxlm/_version.py b/src/loxlm/_version.py new file mode 100644 index 0000000..8f87596 --- /dev/null +++ b/src/loxlm/_version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '0.1.dev40+g8f79d6d.d20240626' +__version_tuple__ = version_tuple = (0, 1, 'dev40', 'g8f79d6d.d20240626') diff --git a/src/context_retriever.py b/src/loxlm/context_retriever.py similarity index 91% rename from src/context_retriever.py rename to src/loxlm/context_retriever.py index e8780a4..dafa7e0 100644 --- a/src/context_retriever.py +++ b/src/loxlm/context_retriever.py @@ -1,11 +1,11 @@ -from langchain_community.embeddings import HuggingFaceBgeEmbeddings +import json +from langchain_community.embeddings import HuggingFaceBgeEmbeddings from langchain_milvus.vectorstores import Milvus as m -from utils.bids_split import BidsSplitter -from utils.pdf_split import PdfSplitter -from utils.example_loader import ExampleLoader -import json +from loxlm.utils.bids_split import BidsSplitter +from loxlm.utils.example_loader import ExampleLoader +from loxlm.utils.pdf_split import PdfSplitter URI = "http://localhost:19530" diff --git a/src/rag_input.py b/src/loxlm/rag_input.py similarity index 92% rename from src/rag_input.py rename to src/loxlm/rag_input.py index 8f7840b..7059e3e 100644 --- a/src/rag_input.py +++ b/src/loxlm/rag_input.py @@ -2,8 +2,6 @@ import json -from langchain.agents import AgentExecutor -from langchain.agents import create_structured_chat_agent from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.tools.retriever import create_retriever_tool @@ -11,21 +9,18 @@ from langchain_community.llms import Ollama from langchain_core.example_selectors import SemanticSimilarityExampleSelector from langchain_core.output_parsers import PydanticOutputParser -from langchain_core.prompts.chat import ChatPromptTemplate -from langchain_core.prompts.chat import HumanMessagePromptTemplate -from langchain_core.prompts.chat import SystemMessagePromptTemplate +from langchain_core.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) from langchain_core.prompts.few_shot import FewShotChatMessagePromptTemplate -from langchain_core.pydantic_v1 import BaseModel -from langchain_core.pydantic_v1 import Field +from langchain_core.pydantic_v1 import BaseModel, Field +from langchain_core.runnables import RunnablePassthrough from langchain_milvus.vectorstores import Milvus as m - from utils.bids_split import BidsSplitter -from utils.pdf_split import PdfSplitter from utils.example_loader import ExampleLoader -import os -import re -from langchain_core.runnables import RunnablePassthrough -from langchain_core.tools import tool +from utils.pdf_split import PdfSplitter # Database Parameters print("Database Parameters") diff --git a/src/utils/bids_split.py b/src/loxlm/utils/bids_split.py similarity index 94% rename from src/utils/bids_split.py rename to src/loxlm/utils/bids_split.py index e7c6930..e2b741f 100644 --- a/src/utils/bids_split.py +++ b/src/loxlm/utils/bids_split.py @@ -1,10 +1,12 @@ import os import re -from langchain_text_splitters import RecursiveCharacterTextSplitter -from langchain_text_splitters import MarkdownTextSplitter + import nltk -from nltk.tokenize import word_tokenize +from langchain_text_splitters import ( + MarkdownTextSplitter, +) from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize nltk.download('stopwords') diff --git a/src/utils/example_loader.py b/src/loxlm/utils/example_loader.py similarity index 96% rename from src/utils/example_loader.py rename to src/loxlm/utils/example_loader.py index 83f6cbc..9c9c6c4 100644 --- a/src/utils/example_loader.py +++ b/src/loxlm/utils/example_loader.py @@ -1,6 +1,9 @@ -from .multi_example_selector import Example -from pathlib import Path import json +from pathlib import Path + +from loxlm.utils.multi_example_selector import Example + + class ExampleLoader: def __init__(self, test_split: float=.8, file: str | Path = "/home/rand/github/LoxLM/LoxLM/src/utils/example_loader.py",): try: diff --git a/src/utils/extract_cleaner.py b/src/loxlm/utils/extract_cleaner.py similarity index 100% rename from src/utils/extract_cleaner.py rename to src/loxlm/utils/extract_cleaner.py diff --git a/src/utils/multi_example_selector.py b/src/loxlm/utils/multi_example_selector.py similarity index 97% rename from src/utils/multi_example_selector.py rename to src/loxlm/utils/multi_example_selector.py index d9417c7..781f014 100644 --- a/src/utils/multi_example_selector.py +++ b/src/loxlm/utils/multi_example_selector.py @@ -1,13 +1,10 @@ -from langchain_core.example_selectors.base import BaseExampleSelector -import statistics from abc import ABC, abstractmethod -from sentence_transformers import SentenceTransformer -from typing import Optional, Union + import numpy as np +from langchain_core.example_selectors.base import BaseExampleSelector from pydantic import BaseModel, validator -import typing -import scipy -from sklearn import preprocessing +from sentence_transformers import SentenceTransformer + class Example(BaseModel): index: str diff --git a/src/utils/openneuro.tsv b/src/loxlm/utils/openneuro.tsv similarity index 100% rename from src/utils/openneuro.tsv rename to src/loxlm/utils/openneuro.tsv diff --git a/src/utils/openneuro_extractor.py b/src/loxlm/utils/openneuro_extractor.py similarity index 99% rename from src/utils/openneuro_extractor.py rename to src/loxlm/utils/openneuro_extractor.py index 74b5ca6..7de6582 100644 --- a/src/utils/openneuro_extractor.py +++ b/src/loxlm/utils/openneuro_extractor.py @@ -1,8 +1,9 @@ # %% -import pandas as pd -import os import json +import os import subprocess + +import pandas as pd import yaml bids_suffix_path = "./bids-schema/versions/master/schema/objects/suffixes.yaml" diff --git a/src/utils/pdf_split.py b/src/loxlm/utils/pdf_split.py similarity index 99% rename from src/utils/pdf_split.py rename to src/loxlm/utils/pdf_split.py index 77523f9..325cc68 100644 --- a/src/utils/pdf_split.py +++ b/src/loxlm/utils/pdf_split.py @@ -1,7 +1,9 @@ import os + from langchain_community.document_loaders import PyPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter + class PdfSplitter: def __init__(self, diff --git a/src/test/context_retriever.py b/src/test/context_retriever.py deleted file mode 100644 index 05ec74e..0000000 --- a/src/test/context_retriever.py +++ /dev/null @@ -1,63 +0,0 @@ -from langchain_community.embeddings import HuggingFaceBgeEmbeddings - -from langchain_milvus.vectorstores import Milvus as m - -from .utils.bids_split import BidsSplitter -from pdf_split import PdfSplitter -from src.utils.example_loader import ExampleLoader -import json - -URI = "http://localhost:19530" - -connection_args = {"uri": URI} - -CONTEXT_COLLECTION = "context_db" - -bids_splitter = BidsSplitter() -bids_splits = bids_splitter.get_splits() - -pdf_splitter = PdfSplitter() -pdf_splits = pdf_splitter.get_splits() - -all_context = bids_splits - -print("Embedding Model Load") -# Embedding Model Initialization - -model_name = "BAAI/bge-small-en" -model_kwargs = {"device": "cuda"} -encode_kwargs = {"normalize_embeddings": True} -hf = HuggingFaceBgeEmbeddings( - model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs -) - -# Context -print("Context Store") -context_store = m( - embedding_function=hf, - connection_args=connection_args, - collection_name=CONTEXT_COLLECTION, - drop_old=True, -).from_documents( - all_context, embedding=hf, collection_name=CONTEXT_COLLECTION, connection_args=connection_args -) - -examples_test, _ = ExampleLoader().get_splits() -inputs = [test['h'] for test in examples_test] - - -# Construct Prompt -print("Retriever") -retriever = context_store.as_retriever(search_kwargs={"k": 2, "fetch_k": 10}) - -def format_docs(d): - return str(d) - -inputs = inputs[:5] - -context = [format_docs(retriever.invoke(input)) for input in inputs] - -out = zip(inputs, context) -data = json.dumps(out) -with open("context_results.json",'w') as f: - f.write(data) diff --git a/custom_selector.py b/test/custom_selector.py similarity index 92% rename from custom_selector.py rename to test/custom_selector.py index bf52cd8..eeaf18e 100644 --- a/custom_selector.py +++ b/test/custom_selector.py @@ -1,11 +1,12 @@ +from __future__ import annotations + import pytest -from src.utils.multi_example_selector import ( - BaseExampleRanker, +from sentence_transformers import SentenceTransformer +from loxlm.utils.multi_example_selector import ( FloatExampleRanker, SemanticExampleRanker, - MultiExampleSelector, ) -from sentence_transformers import SentenceTransformer + @pytest.fixture def model(): @@ -53,4 +54,3 @@ def test_eval_distance(float_ranker): def test_semantic_add_example(semantic_ranker): semantic_ranker.add_example("couch") - diff --git a/test/test_context_retriever.py b/test/test_context_retriever.py new file mode 100644 index 0000000..9546bfa --- /dev/null +++ b/test/test_context_retriever.py @@ -0,0 +1,65 @@ +import json + +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +from langchain_milvus.vectorstores import Milvus as m + +from loxlm.utils.pdf_split import PdfSplitter +from loxlm.utils.example_loader import ExampleLoader +from loxlm.utils.bids_split import BidsSplitter + +def test_context_retriver(): + + URI = "http://localhost:19530" + + connection_args = {"uri": URI} + + CONTEXT_COLLECTION = "context_db" + + bids_splitter = BidsSplitter() + bids_splits = bids_splitter.get_splits() + + pdf_splitter = PdfSplitter() + pdf_splits = pdf_splitter.get_splits() + + all_context = bids_splits + + print("Embedding Model Load") + # Embedding Model Initialization + + model_name = "BAAI/bge-small-en" + model_kwargs = {"device": "cuda"} + encode_kwargs = {"normalize_embeddings": True} + hf = HuggingFaceBgeEmbeddings( + model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs + ) + + # Context + print("Context Store") + context_store = m( + embedding_function=hf, + connection_args=connection_args, + collection_name=CONTEXT_COLLECTION, + drop_old=True, + ).from_documents( + all_context, embedding=hf, collection_name=CONTEXT_COLLECTION, connection_args=connection_args + ) + + examples_test, _ = ExampleLoader().get_splits() + inputs = [test['h'] for test in examples_test] + + + # Construct Prompt + print("Retriever") + retriever = context_store.as_retriever(search_kwargs={"k": 2, "fetch_k": 10}) + + def format_docs(d): + return str(d) + + inputs = inputs[:5] + + context = [format_docs(retriever.invoke(input)) for input in inputs] + + out = zip(inputs, context) + data = json.dumps(out) + with open("context_results.json",'w') as f: + f.write(data) From 13bf8c8931b12aa6a72d5990d6d2148fd8224824 Mon Sep 17 00:00:00 2001 From: Remi Gau Date: Wed, 26 Jun 2024 22:58:44 +0200 Subject: [PATCH 5/5] rm _version.py --- .gitignore | 2 +- src/loxlm/_version.py | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 src/loxlm/_version.py diff --git a/.gitignore b/.gitignore index cbc1f2b..be1a821 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ volumes/* -src/_version.py +src/loxlm/_version.py # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/loxlm/_version.py b/src/loxlm/_version.py deleted file mode 100644 index 8f87596..0000000 --- a/src/loxlm/_version.py +++ /dev/null @@ -1,16 +0,0 @@ -# file generated by setuptools_scm -# don't change, don't track in version control -TYPE_CHECKING = False -if TYPE_CHECKING: - from typing import Tuple, Union - VERSION_TUPLE = Tuple[Union[int, str], ...] -else: - VERSION_TUPLE = object - -version: str -__version__: str -__version_tuple__: VERSION_TUPLE -version_tuple: VERSION_TUPLE - -__version__ = version = '0.1.dev40+g8f79d6d.d20240626' -__version_tuple__ = version_tuple = (0, 1, 'dev40', 'g8f79d6d.d20240626')