# pyproject.toml

# Package metadata consumed by Poetry when building and publishing `lilac`.
[tool.poetry]
name = "lilac"
version = "0.3.9"
description = "Organize unstructured data"
authors = ["Lilac AI Inc. <info@lilacml.com>"]
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/lilacai/lilac"
packages = [{ include = "lilac" }]
# Include the web app assets.
include = ["lilac/web/**/*"]

# Runtime dependencies. Required packages come first (interpreter constraint,
# then alphabetical); optional packages follow, grouped by the feature that
# needs them, and are only installed through [tool.poetry.extras].
[tool.poetry.dependencies]

### Required dependencies. ###
python = ">=3.9,<4.0"

authlib = "^1.2.1"
click = "^8.1.3"
cloudpickle = "^2.0.0"
datasets = "^2.12.0"
duckdb = "^0.10.0"
fastapi = "^0.109.1"
fsspec = "^2023.9.2"
gcsfs = "^2023.9.2"
google-cloud-storage = "^2.5.0"
gunicorn = "^21.2.0"
# Fast KNN vector store.
hnswlib = "^0.8.0"
httpx = "^0.24.1"
instructor = "^0.4.0"
itsdangerous = "^2.1.2"
# Used for directory listing via /_data
jinja2 = "^3.1.3"
joblib = "^1.3.1"
loky = "^3.4.1"
modal = "^0.56.4396"
# Fast JSON serialization: https://fastapi.tiangolo.com/advanced/custom-response/#use-orjsonresponse
orjson = "^3.8.10"
pandas = "^2.0"
# Image processing.
pillow = "^10.2.0"
psutil = "^5.9.5"
pyarrow = "^14.0.1"
pydantic = "^2.5.2"
python-dotenv = "^1.0.0"
pyyaml = "^6.0.1"
requests = "^2"
scikit-learn = "^1.3.0"
tenacity = "^8.2.2"
tiktoken = "^0.5.1"
tqdm = "^4.66.1"
# NOTE(review): exact pin, unlike the caret ranges used elsewhere — confirm it is intentional.
typing-extensions = "4.9.0"
uvicorn = { extras = ["standard"], version = "^0.23.2" }

### Optional dependencies. ###

# LLM providers.
cohere = { version = "^4.32", optional = true }
openai = { version = "^1.7.1", optional = true }
# SBERT on-device embeddings.
sentence-transformers = { version = "^2.3.1", optional = true }
# bge on-device embeddings.
FlagEmbedding = { version = "^1.2.3", optional = true }
transformers = { version = "^4.37.2", optional = true }
# Nomic on-device embeddings.
einops = { version = "^0.7.0", optional = true }

# Gmail source.
email-reply-parser = { version = "^0.5.12", optional = true }
google-api-python-client = { version = "^2.88.0", optional = true }
google-auth-httplib2 = { version = "^0.1.0", optional = true }
google-auth-oauthlib = { version = "^1.0.0", optional = true }

# Text statistics.
textacy = { version = "^0.13.0", optional = true }

# Named entity recognition.
spacy = { version = "^3.5.1", optional = true }

# For PII and secrets.
detect-secrets = { version = "^1.4.0", optional = true }
presidio_analyzer = { version = "^2.2", optional = true }

# For language detection.
langdetect = { version = "^1.0.9", optional = true }

# Langsmith source ingestion.
langsmith = { version = "^0.0.41", optional = true }

# LlamaIndex for GitHub source ingestion (restricted to Python below 3.12).
llama-index = { version = "^0.10", optional = true, python = ">=3.9,<3.12" }
llama-hub = { version = "^0.0.67", optional = true, python = ">=3.9,<3.12" }

# For HDBScan to reduce dimensionality before running clustering.
umap-learn = { version = "^0.5.4", optional = true }
hdbscan = { version = "^0.8.33", optional = true }

# Extras map an extra name to the optional packages it installs. Every entry
# must be the name of a package declared with `optional = true` under
# [tool.poetry.dependencies]; Poetry does not expand the name of another extra,
# so aggregate extras have to spell out their packages.
[tool.poetry.extras]
# Every optional dependency.
all = [
  "cohere",
  "detect-secrets",
  "einops",
  "email-reply-parser",
  "FlagEmbedding",
  "google-api-python-client",
  "google-auth-httplib2",
  "google-auth-oauthlib",
  "hdbscan",
  "langdetect",
  "langsmith",
  "llama-hub",
  "llama-index",
  "openai",
  "presidio_analyzer",
  "sentence-transformers",
  "spacy",
  "textacy",
  "transformers",
  "umap-learn",
]

embeddings = ["cohere", "openai", "sentence-transformers"]
llms = ["openai"]

# Sources: the packages of the `gmail` and `langsmith` extras, listed
# explicitly because an extra cannot reference another extra by name.
sources = [
  "email-reply-parser",
  "google-api-python-client",
  "google-auth-httplib2",
  "google-auth-oauthlib",
  "langsmith",
]

## Individual sources.
gmail = [
  "email-reply-parser",
  "google-api-python-client",
  "google-auth-httplib2",
  "google-auth-oauthlib",
]
langsmith = ["langsmith"]
github = ["llama-index", "llama-hub"]

# Signals.
# NOTE(review): `presidio_analyzer` (in `pii`) and `umap-learn` are not part of
# this aggregate — confirm whether that is intentional.
signals = ["textacy", "detect-secrets", "langdetect", "hdbscan"]

## Individual signals.
lang_detection = ["langdetect"]               # Language detection.
pii = ["detect-secrets", "presidio_analyzer"] # PII.
text_stats = ["textacy"]                      # Text statistics.

# Individual embeddings.
gte = ["sentence-transformers"]
bge = ["FlagEmbedding", "transformers"]
nomic = ["sentence-transformers", "einops"]
sbert = ["sentence-transformers"]
cohere = ["cohere"]
openai = ["openai"]

# Deps for development. The group is optional, so it is installed only when
# explicitly requested (e.g. `poetry install --with dev`).
[tool.poetry.group.dev]
optional = true

# Development-only tooling (tests, type stubs, linting, profiling), kept in
# alphabetical order.
[tool.poetry.group.dev.dependencies]
devtools = "^0.12.2"
freezegun = "^1.2.2"
google-api-python-client-stubs = "^1.13.0"
# NOTE(review): httpx is also declared in [tool.poetry.dependencies] at "^0.24.1";
# this entry looks redundant — confirm before removing.
httpx = "^0.24.0"
huggingface-hub = "^0.20.2"
line-profiler = "^4.1.1"
matplotlib = "^3.7.1"
memray = "^1.10.0"
mypy = "^1.8.0"
notebook = "^7.0.4"
pytest = "^7.2.0"
pytest-asyncio = "^0.20.2"
pytest-cov = "^4.0.0"
pytest-mock = "^3.10.0"
pytest-repeat = "^0.9.3"
ruff = "^0.1.3"
setuptools = "^65.5.0"
toml = "^0.10.2"
tuna = "^0.5.11"
types-cachetools = "^5.3.0.5"
types-Pillow = "^10.2.0"
types-psutil = "^5.9.5.12"
types-regex = "^2023.6.3.0"
types-requests = "^2.28.11.5"
types-tqdm = "^4.65.0.0"
watchdog = { extras = ["watchmedo"], version = "^3.0.0" }
wheel = "^0.37.1"


# Console script: installs a `lilac` command that invokes the `cli` object
# defined in the `lilac.cli` module.
[tool.poetry.scripts]
lilac = "lilac.cli:cli"

# PEP 517 build configuration: build frontends install `requires`, then call
# the named backend to produce the sdist and wheel.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# Restrict Pyright's analysis to the `lilac` package directory.
[tool.pyright]
include = ["lilac"]

# Ruff linter configuration for the `lilac` package.
[tool.ruff]
src = ["lilac"]
line-length = 100
indent-width = 2

# Apply available autofixes whenever the linter runs.
fix = true

# Enabled rule families: pycodestyle errors "E" and warnings "W", Pyflakes "F",
# flake8-quotes "Q", isort "I" and pydocstyle "D".
select = ["E", "W", "F", "Q", "I", "D"]
ignore = [
  "D105", # Missing docstring in magic method.
  "D106", # Missing docstring in a public nested class.
  "D107", # Missing docstring in __init__.
  "D203", # 1 blank line required before class docstring.
  "D204", # 1 blank line required after class docstring.
  "D213", # Multi-line docstring summary should start at the second line.
  "D401", # First line of docstring should be in imperative mood: "{first_line}"
  "D407", # Missing dashed underline after section.
]

# Exclude a variety of commonly ignored directories.
exclude = [
  "__pycache__",
  ".bzr",
  ".direnv",
  ".eggs",
  ".git",
  ".hg",
  ".mypy_cache",
  ".nox",
  ".pants.d",
  ".ruff_cache",
  ".svn",
  ".tox",
  ".venv",
  "__pypackages__",
  "_build",
  "buck-out",
  "build",
  "dist",
  "node_modules",
  "venv",
]

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Target the oldest interpreter the package supports (python = ">=3.9,<4.0" in
# [tool.poetry.dependencies]). This setting is the *minimum* version ruff
# assumes, so it must not be newer than what users may run.
target-version = "py39"

# Settings for the `ruff format` code formatter.
[tool.ruff.format]
# Indent with spaces and prefer single-quoted strings.
indent-style = "space"
quote-style = "single"
# Detect the line ending from each file, and respect existing trailing commas.
line-ending = "auto"
skip-magic-trailing-comma = false

[tool.ruff.mccabe]
# Unlike Flake8, default to a complexity level of 10.
# NOTE(review): `select` in [tool.ruff] does not list the "C90" (mccabe) rule
# family, so this threshold presumably has no effect until it is enabled — confirm.
max-complexity = 10

[tool.ruff.flake8-quotes]
# Prefer single quotes for inline strings; this matches `quote-style = "single"`
# in [tool.ruff.format].
inline-quotes = "single"

# Rules switched off for files matching a glob pattern.
[tool.ruff.per-file-ignores]
# Tests do not need docstrings on classes, methods, functions or packages.
"*_test.py" = ["D101", "D102", "D103", "D104"]
# Package `__init__.py` files do not need a package docstring.
"__init__.py" = ["D104"]

[tool.ruff.pep8-naming]
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
# NOTE(review): `select` in [tool.ruff] does not list the "N" (pep8-naming) rule
# family, so this setting presumably has no effect until it is enabled — confirm.
classmethod-decorators = ["classmethod", "pydantic.validator"]

# Check docstrings ("D" rules) against the Google docstring convention.
[tool.ruff.lint.pydocstyle]
convention = "google"

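# Consider enabling these options for more compact code.
# NOTE(review): the option below does not look like a ruff setting (it resembles
# a YAPF knob left over from an earlier formatter setup); uncommenting it here
# would place it under [tool.ruff.lint.pydocstyle] — confirm before use.
# split_before_named_assigns = false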