-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from sahajsoft/flair
Adding flair recognizer and anonymizer base code
- Loading branch information
Showing
23 changed files
with
4,067 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
use flake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# CI pipeline: on every push, install the poetry-managed dependencies
# (re-using a cached virtualenv when the lock files are unchanged) and run
# the unit tests.
name: "Test and Build"

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: "Setup poetry"
        uses: abatilo/actions-poetry@v2

      # The cache step below stores ./.venv, so poetry has to create the
      # virtualenv inside the project directory; by default it would place it
      # in its own cache directory and there would be nothing to save/restore.
      - name: "Use an in-project virtualenv"
        run: |
          poetry config virtualenvs.create true --local
          poetry config virtualenvs.in-project true --local
        shell: bash

      - name: "Load cached venv"
        id: cached-poetry-dependencies
        uses: actions/cache@v4
        with:
          path: .venv
          # A change to either lock file invalidates the cached virtualenv.
          key: venv-${{ runner.os }}-${{ hashFiles('**/flake.lock') }}-${{ hashFiles('**/poetry.lock') }}

      # Skipped on an exact cache hit: the restored .venv already holds the dependencies.
      - name: "Install python dependencies"
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root
        shell: bash

      # globstar makes `**` recurse, so tests in nested directories are picked up too.
      - name: "Test"
        run: 'shopt -s globstar && poetry run python -m unittest tests/**/*.py'
        shell: bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# PII Detection and Anonymiser | ||
|
||
This is a service that helps you detect and anonymise PII. | ||
|
||
## Prerequisites | ||
|
||
Run `./setup.sh` to install all dependencies. This will install [direnv](https://github.com/direnv/direnv/blob/master/docs/installation.md) and [nix](https://nixos.org/download.html), then simply run `direnv allow` to install all build dependencies. | ||
|
||
Alternatively, make sure you have [python 3.11](https://www.python.org/downloads/) and [poetry](https://python-poetry.org/docs/#installation) set up on your machine. | ||
|
||
## Getting Started | ||
|
||
To get started, run the following: | ||
|
||
``` | ||
poetry install | ||
poetry run python -m unittest tests/**.py | ||
``` | ||
|
||
## Troubleshooting | ||
|
||
There is a chance that `direnv allow` will silently fail to load the environment correctly. You can tell this has happened if running `poetry install` gives a `command not found` error in the shell. | ||
To fix this, run the nix command directly: | ||
|
||
``` | ||
nix --extra-experimental-features 'nix-command flakes' develop | ||
``` | ||
This command starts a new shell instance with the Nix dependencies loaded. Run your subsequent commands (such as `poetry install`) from that shell. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from presidio_analyzer import AnalyzerEngine | ||
from typing import List, Iterable, Optional | ||
|
||
from presidio_analyzer import BatchAnalyzerEngine, DictAnalyzerResult | ||
import csv | ||
|
||
|
||
class CSVAnalyzerEngine(BatchAnalyzerEngine):
    """Batch analyzer that runs presidio analysis over the columns of a CSV file.

    The wrapped ``AnalyzerEngine`` is built from the NLP engine and recognizer
    registry returned by ``nlp_engine.create_nlp_engine()``.
    """

    def __init__(self, nlp_engine):
        """Build the underlying analyzer from an NLP engine configuration.

        :param nlp_engine: object exposing ``create_nlp_engine()``, which must
            return an ``(nlp_engine, registry)`` pair.
        """
        self.nlp_engine = nlp_engine
        analyzer_engine = self.create_analyser_engine()
        super().__init__(analyzer_engine)

    def create_analyser_engine(self):
        """Return an ``AnalyzerEngine`` using the configured NLP engine and registry."""
        nlp_engine, registry = self.nlp_engine.create_nlp_engine()
        return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

    def analyze_csv(
        self,
        csv_full_path: str,
        language: str,
        keys_to_skip: Optional[List[str]] = None,
        **kwargs,
    ) -> Iterable[DictAnalyzerResult]:
        """Analyze every column of a CSV file.

        The first row is treated as the header; each remaining row contributes
        one value per column. Rows are transposed with ``zip``, so columns are
        truncated to the length of the shortest row.

        :param csv_full_path: path of the CSV file to read.
        :param language: language code passed on to ``analyze_dict``.
        :param keys_to_skip: column headers passed on to ``analyze_dict`` as
            keys to skip.
        :param kwargs: extra keyword arguments forwarded to ``analyze_dict``.
        :return: list of the results produced by ``analyze_dict``.
        """
        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields containing embedded newlines.
        with open(csv_full_path, "r", newline="") as csv_file:
            csv_list = list(csv.reader(csv_file))

        # Transpose rows into {header: [column values as strings]}.
        csv_dict = {
            header: list(map(str, values))
            for header, *values in zip(*csv_list)
        }
        # Forward **kwargs instead of silently dropping them.
        analyzer_results = self.analyze_dict(
            csv_dict, language, keys_to_skip, **kwargs
        )
        return list(analyzer_results)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from recognizer.flair_recognizer import FlairRecognizer | ||
from presidio_analyzer import RecognizerRegistry | ||
from presidio_analyzer.nlp_engine import NlpEngineProvider | ||
import spacy | ||
|
||
|
||
class NLPEngineConfig:
    """Base configuration that records the model path for an NLP engine factory."""

    def __init__(self, model_path):
        """Store the model path for later use.

        :param model_path: model location or identifier; stored unchanged.
        """
        self.model_path = model_path

    def create_nlp_engine(self):
        """Placeholder hook for subclasses; this base version returns ``None``."""
        return None
|
||
|
||
class FlairNLPEngine(NLPEngineConfig):
    """Engine configuration that plugs a Flair model into presidio as a recognizer."""

    def create_nlp_engine(self):
        """
        Build a spaCy NLP engine plus a recognizer registry that includes Flair.

        Flair doesn't have an official presidio NLP engine, so it is added to the
        registry as a recognizer on top of a spaCy ``en_core_web_sm`` engine, and
        the registry's ``SpacyRecognizer`` is removed.

        :return: tuple ``(nlp_engine, registry)``
        """
        spacy_settings = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }

        recognizers = RecognizerRegistry()
        recognizers.load_predefined_recognizers()

        # Download the small English spaCy model if it is not installed yet.
        if not spacy.util.is_package("en_core_web_sm"):
            spacy.cli.download("en_core_web_sm")

        recognizers.add_recognizer(FlairRecognizer(model_path=self.model_path))
        recognizers.remove_recognizer("SpacyRecognizer")

        engine = NlpEngineProvider(nlp_configuration=spacy_settings).create_engine()
        return engine, recognizers
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Nix flake providing a poetry2nix-built package and a development shell
# with python3 and poetry available.
{
  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    poetry2nix = {
      url = "github:nix-community/poetry2nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
  };

  outputs = { self, nixpkgs, flake-utils, poetry2nix }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        nativeBuildInputs = with pkgs; [ stdenv python3 poetry ];
        buildInputs = with pkgs; [ ];

        # see https://github.com/nix-community/poetry2nix/tree/master#api for more functions and examples.
        pkgs = nixpkgs.legacyPackages.${system};
        inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; })
          mkPoetryApplication;
      in {
        inherit nativeBuildInputs buildInputs;

        packages = {
          myapp = mkPoetryApplication {
            projectDir = self;
            python = pkgs.python3;
          };
          default = self.packages.${system}.myapp;
        };

        devShells = {
          # LD_LIBRARY_PATH is only set on Linux. Attribute values are not
          # shell-expanded, so a fallback of "$LD_LIBRARY_PATH" would export
          # that literal string on other platforms instead of the inherited value.
          default = pkgs.mkShell ({
            packages = nativeBuildInputs ++ buildInputs;
          } // (if pkgs.stdenv.isLinux then {
            LD_LIBRARY_PATH =
              "${pkgs.stdenv.cc.cc.lib}/lib:/run/opengl-driver/lib:/run/opengl-driver-32/lib";
          } else
            { }));
        };
      });
}
Oops, something went wrong.