Skip to content

Commit

Permalink
Merge pull request #5 from sahajsoft/flair
Browse files Browse the repository at this point in the history
Adding flair recognizer and anonymizer base code
  • Loading branch information
akshaykarle authored Apr 25, 2024
2 parents 5fe4053 + 9e847e3 commit aafbc8b
Show file tree
Hide file tree
Showing 23 changed files with 4,067 additions and 1 deletion.
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# direnv: load the development environment described by this repository's Nix flake.
use flake
32 changes: 32 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# CI workflow: installs the Python dependencies with Poetry and runs the unit tests.
name: "Test and Build"

# Triggered on every push.
on: [push]

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: "Setup poetry"
uses: abatilo/actions-poetry@v2

# Restore .venv from the cache. The key changes whenever the runner OS,
# flake.lock or poetry.lock changes, which forces a fresh install.
# NOTE(review): caching .venv only helps if Poetry is configured to create
# its virtualenv in-project (virtualenvs.in-project) - confirm the Poetry config.
- name: "Load cached venv"
id: cached-poetry-dependencies
uses: actions/cache@v4
with:
path: .venv
key: venv-${{ runner.os }}-${{ hashFiles('**/flake.lock') }}-${{ hashFiles('**/poetry.lock') }}

# Skipped when the cache step above reported an exact hit.
- name: "Install python dependencies"
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root
shell: bash

# globstar lets ** expand across nested directories under tests/.
- name: "Test"
run: 'shopt -s globstar && poetry run python -m unittest tests/**/*.py'
shell: bash
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

.direnv/
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# PII Detection and Anonymiser

This is a service that helps you detect and anonymise PII.

## Prerequisites

Run `./setup.sh` to install all dependencies. This will install [direnv](https://github.com/direnv/direnv/blob/master/docs/installation.md) and [nix](https://nixos.org/download.html). Then simply run `direnv allow` to install all build dependencies.

Alternatively, make sure you have [python 3.11](https://www.python.org/downloads/) and [poetry](https://python-poetry.org/docs/#installation) set up on your machine.

## Getting Started

To get started, run the following:

```
poetry install
poetry run python -m unittest tests/**.py
```

## Troubleshooting

There is a chance that `direnv allow` will silently fail to load the environment correctly. This is observable when you attempt to run `poetry install`, as you will get a `command not found` error in the shell.
To fix this, you need to run the nix commands directly. Run the following:

```
nix --extra-experimental-features 'nix-command flakes' develop
```
This command will create a new shell instance that has the Nix dependencies loaded. You will need to run commands through this prompt.
Empty file added analyzer_engine/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions analyzer_engine/csv_analyzer_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from presidio_analyzer import AnalyzerEngine
from typing import List, Iterable, Optional

from presidio_analyzer import BatchAnalyzerEngine, DictAnalyzerResult
import csv


class CSVAnalyzerEngine(BatchAnalyzerEngine):
    """Batch analyzer that reads a CSV file and analyzes it column by column.

    The wrapped ``AnalyzerEngine`` is built from the NLP engine and recognizer
    registry returned by ``nlp_engine.create_nlp_engine()``.
    """

    def __init__(self, nlp_engine):
        """Create the analyzer engine and initialise the batch engine with it.

        :param nlp_engine: configuration object whose ``create_nlp_engine()``
            returns an ``(nlp_engine, registry)`` pair.
        """
        self.nlp_engine = nlp_engine
        analyzer_engine = self.create_analyser_engine()
        super().__init__(analyzer_engine)

    def create_analyser_engine(self):
        """Return an ``AnalyzerEngine`` built from the configured NLP engine."""
        nlp_engine, registry = self.nlp_engine.create_nlp_engine()
        return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

    def analyze_csv(
        self,
        csv_full_path: str,
        language: str,
        keys_to_skip: Optional[List[str]] = None,
        **kwargs,
    ) -> Iterable[DictAnalyzerResult]:
        """Analyze every column of a CSV file.

        The first row is treated as the header; each column becomes one
        dictionary entry mapping the header to the list of its cell values.

        :param csv_full_path: path of the CSV file to read.
        :param language: language code passed to ``analyze_dict``.
        :param keys_to_skip: column headers passed to ``analyze_dict`` as keys
            to skip.
        :param kwargs: extra keyword arguments forwarded to ``analyze_dict``.
        :return: list with the results produced by ``analyze_dict``.
        """
        # newline="" is what the csv module expects, so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(csv_full_path, "r", newline="") as csv_file:
            rows = list(csv.reader(csv_file))

        # Transpose rows into columns; zip stops at the shortest row.
        csv_dict = {
            header: [str(value) for value in values]
            for header, *values in zip(*rows)
        }
        # Forward **kwargs: previously they were accepted but silently dropped.
        analyzer_results = self.analyze_dict(
            csv_dict, language, keys_to_skip, **kwargs
        )
        return list(analyzer_results)
Empty file added config/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions config/nlp_engine_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from recognizer.flair_recognizer import FlairRecognizer
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider
import spacy


class NLPEngineConfig:
    """Base class for NLP engine configurations.

    Subclasses override ``create_nlp_engine`` to build the engine and the
    recognizer registry for the model stored in ``model_path``.
    """

    def __init__(self, model_path):
        """Store the path of the model the engine should use.

        :param model_path: model location, kept as ``self.model_path``.
        """
        self.model_path = model_path

    def create_nlp_engine(self):
        """Build and return an ``(nlp_engine, registry)`` pair.

        :raises NotImplementedError: always, in this base class. Returning
            ``None`` here would only surface later as an opaque unpacking
            error in the caller.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement create_nlp_engine()"
        )


class FlairNLPEngine(NLPEngineConfig):
    """Presidio configuration that plugs a Flair model in as a recognizer."""

    def create_nlp_engine(self):
        """Build the spaCy NLP engine and a registry that includes Flair.

        Flair has no official Presidio NLP engine, so it is registered as a
        recognizer instead. spaCy remains the NLP engine, and its
        ``en_core_web_sm`` model is downloaded when it is not installed. The
        built-in ``SpacyRecognizer`` is removed from the registry.

        :return: ``(nlp_engine, registry)`` tuple.
        """
        spacy_model = "en_core_web_sm"

        recognizers = RecognizerRegistry()
        recognizers.load_predefined_recognizers()

        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        recognizers.add_recognizer(FlairRecognizer(model_path=self.model_path))
        recognizers.remove_recognizer("SpacyRecognizer")

        engine_provider = NlpEngineProvider(
            nlp_configuration={
                "nlp_engine_name": "spacy",
                "models": [{"lang_code": "en", "model_name": spacy_model}],
            }
        )
        return engine_provider.create_engine(), recognizers
175 changes: 175 additions & 0 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions flake.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Nix flake: provides a development shell and a poetry2nix-built package
# for each default system.
{
inputs = {
flake-utils.url = "github:numtide/flake-utils";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
# poetry2nix is made to use this flake's nixpkgs input instead of its own.
poetry2nix = {
url = "github:nix-community/poetry2nix";
inputs.nixpkgs.follows = "nixpkgs";
};
};

outputs = { self, nixpkgs, flake-utils, poetry2nix }:
flake-utils.lib.eachDefaultSystem (system:
let
# Tools placed in the dev shell (see devShells below).
nativeBuildInputs = with pkgs; [ stdenv python3 poetry ];
buildInputs = with pkgs; [ ];

# see https://github.com/nix-community/poetry2nix/tree/master#api for more functions and examples.
pkgs = nixpkgs.legacyPackages.${system};
inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; })
mkPoetryApplication;
in {
# NOTE(review): this also exposes the two lists as per-system flake outputs -
# confirm that is intended.
inherit nativeBuildInputs buildInputs;

packages = {
# The project itself, built from the repository root with poetry2nix.
myapp = mkPoetryApplication {
projectDir = self;
python = pkgs.python3;
};
default = self.packages.${system}.myapp;
};

devShells = {
default = pkgs.mkShell {
packages = nativeBuildInputs ++ buildInputs;
# On Linux, point LD_LIBRARY_PATH at the compiler's runtime libraries and
# the /run/opengl-driver library directories.
# NOTE(review): on other systems the value is the string "$LD_LIBRARY_PATH";
# verify it is expanded by the shell rather than exported literally.
LD_LIBRARY_PATH = if pkgs.stdenv.isLinux then
"${pkgs.stdenv.cc.cc.lib}/lib:/run/opengl-driver/lib:/run/opengl-driver-32/lib"
else
"$LD_LIBRARY_PATH";
};
};
});
}
Loading

0 comments on commit aafbc8b

Please sign in to comment.