Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(pipeline): generate main dag from dbt, using cosmos #289

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion datawarehouse/.dockerignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
*

!/docker-entrypoint-initdb.d
!/requirements
!/processings

# Python
**/*.py[cod]
**/__pycache__/
**/.venv/
**/build/
**/dist/
**/.tox/
**/.pytest_cache/
**/*.egg-info/
**/db.sqlite3
11 changes: 7 additions & 4 deletions datawarehouse/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ RUN apt-get update \
&& apt-get clean -y \
&& rm -rf /var/lib/apt/lists/*

RUN python3.11 -m venv ${VIRTUAL_ENV}
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

COPY ./docker-entrypoint-initdb.d /docker-entrypoint-initdb.d

RUN python3.11 -m venv ${VIRTUAL_ENV}
COPY processings/requirements processings/requirements
RUN pip install --no-cache-dir -r processings/requirements/requirements.txt

COPY requirements requirements
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
RUN pip install --no-cache-dir -r requirements/requirements.txt
COPY processings processings
RUN pip install --no-cache-dir -e processings
18 changes: 18 additions & 0 deletions datawarehouse/processings/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Pin requirements/*.txt from pyproject.toml using uv (run via pipx, so no
# local install of uv is required).
PIP_COMPILE := pipx run uv pip compile pyproject.toml --quiet

# `make upgrade` acts as a modifier goal: when it appears among the goals,
# add --upgrade so every pin is bumped to the latest version allowed by
# pyproject.toml (instead of keeping existing pins).
ifneq (,$(filter upgrade,$(MAKECMDGOALS)))
PIP_COMPILE += --upgrade
endif

.PHONY: all base dev test upgrade

all: base dev test

base:  ## pin the runtime dependencies
	$(PIP_COMPILE) --output-file=requirements/requirements.txt

dev:  ## pin the [dev] extra
	$(PIP_COMPILE) --extra=dev --output-file=requirements/dev-requirements.txt

test:  ## pin the [test] extra
	$(PIP_COMPILE) --extra=test --output-file=requirements/test-requirements.txt

# Without this rule, `make upgrade` fails with "No rule to make target
# 'upgrade'" even though the ifneq above enabled --upgrade. Making it an
# alias for `all` means `make upgrade` re-pins every requirements file.
upgrade: all
47 changes: 47 additions & 0 deletions datawarehouse/processings/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools", "wheel"]

[project]
name = "data-inclusion-processings"
version = "0.1.0"
# Loose version specifiers only; exact pins are generated into
# requirements/*.txt by the Makefile (uv pip compile).
dependencies = [
"numpy~=2.0",
"pandas~=2.2",
"requests~=2.31",
"trafilatura~=1.6",
"tenacity",
]

[project.optional-dependencies]
dev = [
"pre-commit~=2.20",
"ruff~=0.2",
]
test = [
"pytest",
]

# src/ layout: packages live under src/ (e.g. src/data_inclusion/processings).
[tool.setuptools]
package-dir = {"" = "src"}

[tool.ruff.lint]
# see prefixes in https://beta.ruff.rs/docs/rules/
select = [
"F", # pyflakes
"E", # pycodestyle errors
"W", # pycodestyle warnings
"I", # isort
"UP", # pyupgrade
"S", # bandit
]

[tool.ruff.lint.isort]
combine-as-imports = true
known-first-party = ["data_inclusion"]

[tool.pytest.ini_options]
testpaths = "tests"
# NOTE(review): "base base adresse nationale" looks like a typo for the
# "Base Adresse Nationale (BAN)" geocoding API — confirm before changing,
# as the description shows up in `pytest --markers`.
markers = '''
ban_api: mark test as requiring the base base adresse nationale api
'''
88 changes: 88 additions & 0 deletions datawarehouse/processings/requirements/dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --extra=dev --output-file=requirements/dev-requirements.txt
babel==2.16.0
# via courlan
certifi==2024.7.4
# via
# requests
# trafilatura
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.3.2
# via
# htmldate
# requests
# trafilatura
courlan==1.3.0
# via trafilatura
dateparser==1.2.0
# via htmldate
distlib==0.3.8
# via virtualenv
filelock==3.15.4
# via virtualenv
htmldate==1.8.1
# via trafilatura
identify==2.6.0
# via pre-commit
idna==3.7
# via requests
justext==3.0.1
# via trafilatura
lxml==5.3.0
# via
# htmldate
# justext
# lxml-html-clean
# trafilatura
lxml-html-clean==0.2.0
# via lxml
nodeenv==1.9.1
# via pre-commit
numpy==2.0.1
# via
# data-inclusion-processings (pyproject.toml)
# pandas
pandas==2.2.2
# via data-inclusion-processings (pyproject.toml)
platformdirs==4.2.2
# via virtualenv
pre-commit==2.21.0
# via data-inclusion-processings (pyproject.toml)
python-dateutil==2.9.0.post0
# via
# dateparser
# htmldate
# pandas
pytz==2024.1
# via
# dateparser
# pandas
pyyaml==6.0.2
# via pre-commit
regex==2024.7.24
# via dateparser
requests==2.32.3
# via data-inclusion-processings (pyproject.toml)
ruff==0.5.7
# via data-inclusion-processings (pyproject.toml)
six==1.16.0
# via python-dateutil
tenacity==9.0.0
# via data-inclusion-processings (pyproject.toml)
tld==0.13
# via courlan
trafilatura==1.12.0
# via data-inclusion-processings (pyproject.toml)
tzdata==2024.1
# via pandas
tzlocal==5.2
# via dateparser
urllib3==2.2.2
# via
# courlan
# htmldate
# requests
# trafilatura
virtualenv==20.26.3
# via pre-commit
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements/requirements.in
#
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --output-file=requirements/requirements.txt
certifi==2023.7.22
# via
# requests
Expand All @@ -30,22 +26,35 @@ lxml==4.9.3
# htmldate
# justext
# trafilatura
numpy==2.0.1
# via
# data-inclusion-processings (pyproject.toml)
# pandas
pandas==2.2.2
# via data-inclusion-processings (pyproject.toml)
python-dateutil==2.8.2
# via
# dateparser
# htmldate
# pandas
pytz==2023.3.post1
# via dateparser
# via
# dateparser
# pandas
regex==2023.8.8
# via dateparser
requests==2.31.0
# via -r requirements/requirements.in
# via data-inclusion-processings (pyproject.toml)
six==1.16.0
# via python-dateutil
tenacity==9.0.0
# via data-inclusion-processings (pyproject.toml)
tld==0.13
# via courlan
trafilatura==1.6.2
# via -r requirements/requirements.in
# via data-inclusion-processings (pyproject.toml)
tzdata==2024.1
# via pandas
tzlocal==5.0.1
# via dateparser
urllib3==2.0.4
Expand Down
76 changes: 76 additions & 0 deletions datawarehouse/processings/requirements/test-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --extra=test --output-file=requirements/test-requirements.txt
babel==2.16.0
# via courlan
certifi==2024.7.4
# via
# requests
# trafilatura
charset-normalizer==3.3.2
# via
# htmldate
# requests
# trafilatura
courlan==1.3.0
# via trafilatura
dateparser==1.2.0
# via htmldate
htmldate==1.8.1
# via trafilatura
idna==3.7
# via requests
iniconfig==2.0.0
# via pytest
justext==3.0.1
# via trafilatura
lxml==5.3.0
# via
# htmldate
# justext
# lxml-html-clean
# trafilatura
lxml-html-clean==0.2.0
# via lxml
numpy==2.0.1
# via
# data-inclusion-processings (pyproject.toml)
# pandas
packaging==24.1
# via pytest
pandas==2.2.2
# via data-inclusion-processings (pyproject.toml)
pluggy==1.5.0
# via pytest
pytest==8.3.2
# via data-inclusion-processings (pyproject.toml)
python-dateutil==2.9.0.post0
# via
# dateparser
# htmldate
# pandas
pytz==2024.1
# via
# dateparser
# pandas
regex==2024.7.24
# via dateparser
requests==2.32.3
# via data-inclusion-processings (pyproject.toml)
six==1.16.0
# via python-dateutil
tenacity==9.0.0
# via data-inclusion-processings (pyproject.toml)
tld==0.13
# via courlan
trafilatura==1.12.0
# via data-inclusion-processings (pyproject.toml)
tzdata==2024.1
# via pandas
tzlocal==5.2
# via dateparser
urllib3==2.2.2
# via
# courlan
# htmldate
# requests
# trafilatura
44 changes: 44 additions & 0 deletions datawarehouse/processings/scripts/create_udfs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash

# Postgres init script (docker-entrypoint-initdb.d): creates the
# `processings` schema and installs the plpython3u `geocode` UDF that
# delegates to the data_inclusion.processings Python package.

# abort on the first failing command (including a failing psql)
set -e

# perform all actions as $POSTGRES_USER
export PGUSER="$POSTGRES_USER"

# schema must exist before the function can be created in it below
psql --dbname="$POSTGRES_DB" <<- 'EOSQL'
CREATE SCHEMA IF NOT EXISTS processings;
EOSQL


# (re)install the geocode UDF; DROP first so a signature change does not
# make CREATE OR REPLACE fail. The heredoc is quoted ('EOSQL'), so $$ and
# $POSTGRES_* inside it are passed to psql verbatim.
psql --dbname="$POSTGRES_DB" <<- 'EOSQL'
SET search_path TO processings;

DROP FUNCTION IF EXISTS geocode;
CREATE OR REPLACE FUNCTION geocode(
data JSONB,
batch_size INT DEFAULT 1000
)
RETURNS
TABLE(
id TEXT,
result_score FLOAT,
result_label TEXT,
result_city TEXT,
result_type TEXT,
result_citycode TEXT,
result_postcode TEXT
)
AS $$

import json

from data_inclusion import processings

return (
processings.geocode(data=json.loads(data), batch_size=batch_size)
if data is not None
else []
)

$$ LANGUAGE plpython3u;
EOSQL
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Public interface of the ``data_inclusion.processings`` package.

Re-exports :func:`geocode` so callers (including the plpython3u UDF
installed by ``scripts/create_udfs.sh``) can use
``from data_inclusion import processings; processings.geocode(...)``.
"""

from data_inclusion.processings.geocode import geocode

__all__ = [
"geocode",
]
Loading
Loading