From f876ac1b9af2049dca9e3177583d11ecf52fe9f1 Mon Sep 17 00:00:00 2001 From: Daniele Guido Date: Mon, 18 Nov 2024 15:11:53 +0100 Subject: [PATCH] feature/bitmap solr (#76) * Create Github workflow test.yml * add tests for job models and bitmap check with in-memory sqlite db * update settings IMPRESSO_SOLR_FIELDS and add IMPRESSO_SOLR_FIELDS _AS_LIST. Change django command accordingly * move Solr helpers function to specific `imrpesso.utils.solr` module * use dotenv lib to retrieve variables * lint with black and add documentation to Profile Model * Update tasks.py * upgrade to django 5.1.3 * improve admin for user bitmaps * finalize export csv * update progress method to send a more structured message --------- Co-authored-by: Daniele Guido <1181642+danieleguido@users.noreply.github.com> --- .github/workflows/test.yml | 53 +++++ Pipfile | 3 +- Pipfile.lock | 43 ++-- impresso/admin.py | 42 ++-- impresso/base.py | 65 ++++--- .../management/commands/checksystemhealth.py | 38 ++-- .../management/commands/exportqueryascsv.py | 52 +++-- impresso/models/job.py | 111 +++++------ impresso/models/profile.py | 46 +++-- impresso/models/userBitmap.py | 151 ++++++++------- impresso/settings.py | 41 +++- impresso/solr.py | 85 ++++---- impresso/tasks.py | 114 +++++++---- impresso/tests/__init__.py | 0 impresso/tests/models/__init__.py | 0 impresso/tests/models/test_job.py | 80 ++++++++ impresso/tests/models/test_userBitmap.py | 183 ++++++++++++++++++ impresso/tests/test_solr.py | 108 +++++++++++ impresso/tests/utils/__init__.py | 0 impresso/tests/utils/tasks/__init__.py | 0 impresso/tests/utils/test_bitmask.py | 50 +++++ impresso/utils/bitmap.py | 114 +++++++++-- impresso/utils/bitmask.py | 59 ++++++ impresso/utils/solr.py | 107 ++++++++++ impresso/utils/tasks/__init__.py | 111 ++++------- impresso/utils/tasks/export.py | 32 +-- 26 files changed, 1252 insertions(+), 436 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 impresso/tests/__init__.py create mode 100644 impresso/tests/models/__init__.py create mode 100644 impresso/tests/models/test_job.py create mode 100644 impresso/tests/models/test_userBitmap.py create mode 100644 impresso/tests/test_solr.py create mode 100644 impresso/tests/utils/__init__.py create mode 100644 impresso/tests/utils/tasks/__init__.py create mode 100644 impresso/tests/utils/test_bitmask.py create mode 100644 impresso/utils/bitmask.py create mode 100644 impresso/utils/solr.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..cfed42c --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,53 @@ +name: Tests +on: + push: + branches: ['develop'] + pull_request: + branches: ['develop'] +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Check out the code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12.4' + + - name: Install pipenv + run: pip install pipenv + + - name: Install dependencies + run: pipenv install --dev + + - name: Add a .env file + run: | + echo "SECRET_KEY=ThisisaVeryverysecretkey" >> .env + echo "DEBUG=True" >> .env + echo "ALLOWED_HOSTS=localhost" >> .env + echo "CSRF_TRUSTED_ORIGINS=http://localhost" >> .env + echo "IMPRESSO_DB_ENGINE=mysal" >> .env + echo "IMPRESSO_DB_NAME=xxxxxxxxxxxxxxxxxx" >> .env + echo "IMPRESSO_DB_USER=xxxxxxxxxxxxxxxxxx" >> .env + echo "IMPRESSO_DB_PASSWORD=xxxxxxxxxxxxxx" >> .env + echo "IMPRESSO_DB_HOST=localhost" >> .env + echo "IMPRESSO_DB_PORT=0000" >> .env + echo 
"IMPRESSO_SOLR_URL=http://localhost:8983/solr" >> .env + echo "IMPRESSO_SOLR_PASSAGES_URL=http://localhost:8983/solr/passages" >> .env + echo "IMPRESSO_SOLR_USER=ssssssssssssssss" >> .env + echo "IMPRESSO_SOLR_USER_WRITE=ssssssssss" >> .env + echo "IMPRESSO_SOLR_PASSWORD=ssssssssssss" >> .env + echo "IMPRESSO_SOLR_PASSWORD_WRITE=ssssss" >> .env + + - name: Set up debug logging, this requires a secific debug folder + run: | + mkdir -p logs + touch logs/debug.log + + - name: Run Django tests + run: pipenv run ./manage.py test + env: + DJANGO_SETTINGS_MODULE: impresso.settings # replace with your actual settings module diff --git a/Pipfile b/Pipfile index 5d223f4..4af09c6 100644 --- a/Pipfile +++ b/Pipfile @@ -9,10 +9,11 @@ pip = "*" celery = "*" requests = "*" redis = "*" -django = "==5.0.8" +django = "==5.1.3" pymysql = "*" django-registration = "*" gunicorn = "*" +python-dotenv = "*" [dev-packages] "flake8" = "*" diff --git a/Pipfile.lock b/Pipfile.lock index dd23836..9d48034 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "8210f1c5da4fd6800c6cea9138ac29b12bcfcaeb24f3347d4158efe5e29ae38f" + "sha256": "30c87c97a2b6ce88c62c01523ff6727b07391e9dff6ab71ecc1c94ebd126d2c9" }, "pipfile-spec": 6, "requires": { @@ -208,21 +208,21 @@ }, "django": { "hashes": [ - "sha256:333a7988f7ca4bc14d360d3d8f6b793704517761ae3813b95432043daec22a45", - "sha256:ebe859c9da6fead9c9ee6dbfa4943b04f41342f4cea2c4d8c978ef0d10694f2b" + "sha256:8b38a9a12da3ae00cb0ba72da985ec4b14de6345046b1e174b1fd7254398f818", + "sha256:c0fa0e619c39325a169208caef234f90baa925227032ad3f44842ba14d75234a" ], "index": "pypi", "markers": "python_version >= '3.10'", - "version": "==5.0.8" + "version": "==5.1.3" }, "django-registration": { "hashes": [ - "sha256:1a0ccef7ef71e67a78a551abd8ad378977dc14a036f1fcd8be422a68bd5254a9", - "sha256:fa76df481189794f47eb73043ee5cc9bfb0963194b901d7bd8cf914beab1ddd0" + "sha256:5e3677e64f39a5d659768b93938870286b90ff500b983fd69153012fabea73fd", + "sha256:e64b0b1d24886fc740122862630ec00b8c2ffd4ab6d0e0a9bced870374d1436b" ], "index": "pypi", - "markers": "python_version >= '3.7'", - "version": "==3.4" + "markers": "python_version >= '3.9'", + "version": "==5.1.0" }, "gunicorn": { "hashes": [ @@ -251,20 +251,20 @@ }, "packaging": { "hashes": [ - "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002", - "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124" + "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", + "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f" ], "markers": "python_version >= '3.8'", - "version": "==24.1" + "version": "==24.2" }, "pip": { "hashes": [ - "sha256:2cd581cf58ab7fcfca4ce8efa6dcacd0de5bf8d0a3eb9ec927e07405f4d9e2a2", - "sha256:5b5e490b5e9cb275c879595064adce9ebd31b854e3e803740b72f9ccf34a45b8" + "sha256:3790624780082365f47549d032f3770eeb2b1e8bd1f7b2e02dace1afa361b4ed", + "sha256:ebcb60557f2aefabc2e0f918751cd24ea0d56d8ec5445fe1807f1d2109660b99" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==24.2" + "version": "==24.3.1" }, "prompt-toolkit": { "hashes": [ @@ -291,6 +291,15 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.9.0.post0" }, + "python-dotenv": { + "hashes": [ + "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", + "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + 
"version": "==1.0.1" + }, "pytz": { "hashes": [ "sha256:31cb35c89bd7d333cd32c5f278fca91b523b0834369e757f4c5641ea252236ca", @@ -301,12 +310,12 @@ }, "redis": { "hashes": [ - "sha256:f6c997521fedbae53387307c5d0bf784d9acc28d9f1d058abeac566ec4dbed72", - "sha256:f8ea06b7482a668c6475ae202ed8d9bcaa409f6e87fb77ed1043d912afd62e24" + "sha256:0b1087665a771b1ff2e003aa5bdd354f15a70c9e25d5a7dbf9c722c16528a7b0", + "sha256:ae174f2bb3b1bf2b09d54bf3e51fbc1469cf6c10aa03e21141f51969801a7897" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==5.1.1" + "version": "==5.2.0" }, "requests": { "hashes": [ diff --git a/impresso/admin.py b/impresso/admin.py index ad36156..09b1247 100644 --- a/impresso/admin.py +++ b/impresso/admin.py @@ -4,7 +4,7 @@ from django.contrib.auth.admin import UserAdmin as BaseUserAdmin from django.contrib.auth.models import User from django.utils.translation import ngettext - +from django.utils import timezone from .models import Profile, Issue, Job, Page, Newspaper from .models import SearchQuery, ContentItem from .models import Collection, CollectableItem, Tag, TaggableItem @@ -38,6 +38,7 @@ class UserBitmapAdmin(admin.ModelAdmin): "date_accepted_terms", ) search_fields = ["user__username", "user__email"] + actions = ["set_terms_accepted_date"] def num_subscriptions(self, obj): return obj.subscriptions.count() @@ -45,27 +46,28 @@ def num_subscriptions(self, obj): def bitmap_display(self, obj): if obj.bitmap is None: return "" - return bin(int.from_bytes(obj.bitmap, byteorder="big")) + return bin(obj.get_bitmap_as_int()) def user_plan_display(self, obj): - if obj.bitmap is None: - return "-" - bitmap_int = int.from_bytes(obj.bitmap, byteorder="big") - bitmap_length = bitmap_int.bit_length() - # Extract the first 5 bits - bitmap_plan = ( - bitmap_int >> (bitmap_length - UserBitmap.BITMAP_PLAN_MAX_LENGTH) - ) & 0b11111 - if bitmap_plan == UserBitmap.USER_PLAN_GUEST: - return "Guest" - if bitmap_plan == UserBitmap.USER_PLAN_AUTH_USER: - return "Impresso Registered User" - if bitmap_plan == UserBitmap.USER_PLAN_EDUCATIONAL: - return "Student or Teacher - Educational User" - if bitmap_plan == UserBitmap.USER_PLAN_RESEARCHER: - return "Researcher - Academic User" - - return bin(bitmap_plan) + return obj.get_user_plan() + + @admin.action(description="Accept the terms of use for selected users") + def set_terms_accepted_date(self, request, queryset): + # for each user, do a proper save + updated = queryset.count() + for user_bitmap in queryset: + user_bitmap.date_accepted_terms = timezone.now() + user_bitmap.save() + self.message_user( + request, + ngettext( + "%d user accepted the terms of use.", + "%d users accepted the terms of use.", + updated, + ) + % updated, + messages.SUCCESS, + ) user_plan_display.short_description = "User Plan" diff --git a/impresso/base.py b/impresso/base.py index 0926823..79712a3 100644 --- a/impresso/base.py +++ b/impresso/base.py @@ -1,23 +1,54 @@ import os, re from django.core.exceptions import ImproperlyConfigured from pathlib import Path # python3 only +from dotenv import dotenv_values +from typing import Any, Optional +# # e.g. 
set ENV=production to get .production.env file +dotenv_filename = ( + ".{0}.env".format(os.environ.get("ENV", "")) if "ENV" in os.environ else ".env" +) +dotenv_path = str(Path(".") / dotenv_filename) +dotenv_dict = dotenv_values(dotenv_path=dotenv_path, verbose=True) + +print(f"Loading env file: \033[94m{dotenv_path}\033[0m") +# check that the file exists +if not os.path.exists(dotenv_path): + raise ImproperlyConfigured("No .env file found at {0}".format(dotenv_path)) + +# for k, v in dotenv_dict.items(): +# print("{0}={1}".format(k, v)) + + +def get_env_variable(var_name: str, default: Optional[Any] = None) -> Any: + """ + Retrieve the value of an environment variable based on the selected environment file. + + The function first checks if the variable is defined in the dotenv file corresponding to the + current environment mode, as determined by the `ENV` setting. If `ENV` is set to a specific value + (e.g., `test`), the function loads variables from `.test.env`. If the variable is not found in + the dotenv file, it then checks the system's environment variables. If still not found, it returns + the `default` value if provided, or raises an error if required. -def dotenv_values(dotenv_path): - lines = [] - with open(dotenv_path) as fp: - lines = fp.read().splitlines() + Environment Modes: + Set `ENV` to specify which dotenv file to load: + - `ENV=production` loads `.production.env`. + - `ENV=test` loads `.test.env`. + - If `ENV` is not set, the default `.env` file may be used. - # get tuples of values,property splitting each line of the file - lines = map(lambda l: tuple(re.split(r"\s*=\s*", l, 1)), filter(None, lines)) - lines = list(lines) - print(f"dotenv_values: found {len(lines)} valid lines") - if not lines: - return dict() - return dict(lines) + Args: + var_name (str): Name of the environment variable to retrieve. + default (Optional[Any]): Value to return if the variable is not found. Defaults to None. + Returns: + Any: The value of the environment variable or the `default` value if not found. -def get_env_variable(var_name, default=None): + Raises: + ImproperlyConfigured: If the environment variable is not found and no `default` is provided. + + Example: + >>> get_env_variable('DATABASE_URL', default='sqlite:///:memory:') + """ if var_name in dotenv_dict: return dotenv_dict[var_name] try: @@ -27,13 +58,3 @@ def get_env_variable(var_name, default=None): return default error_msg = "Set the %s environment variable" % var_name raise ImproperlyConfigured(error_msg) - - -# e.g. 
set ENV=production to get .production.env file -dotenv_filename = ( - ".{0}.env".format(os.environ.get("ENV", "")) if "ENV" in os.environ else ".env" -) -dotenv_path = str(Path(".") / dotenv_filename) -dotenv_dict = dotenv_values(dotenv_path=dotenv_path) - -print("loading env file: {0}".format(dotenv_filename)) diff --git a/impresso/management/commands/checksystemhealth.py b/impresso/management/commands/checksystemhealth.py index b64f0c5..539025c 100644 --- a/impresso/management/commands/checksystemhealth.py +++ b/impresso/management/commands/checksystemhealth.py @@ -4,17 +4,6 @@ from django.conf import settings from django.db import connection -FLS = [ - "id", - "content_length_i", - "snippet_plain", - "bm_explore_s", - "bm_get_tr_s", - "bm_get_img_s", - "meta_journal_s", - "meta_partnerid_s", -] - class Command(BaseCommand): help = "Check SOLR connectivity" @@ -26,7 +15,11 @@ def handle(self, *args, **options): database_name = cursor.fetchone()[0] self.stdout.write( - f"Current Database: \n \033[94m{database_name}\033[0m\n\n" + f"Current Database: \n" + f" host: \033[94m{settings.DATABASES["default"]["HOST"]}\033[0m\n" + f" port: \033[94m{settings.DATABASES["default"]["PORT"]}\033[0m\n" + f" engine: \033[94m{settings.DATABASES["default"]["ENGINE"]}\033[0m\n" + f" name: \033[94m{database_name}\033[0m\n\n" ) cursor.execute("SHOW TABLES") tables = [t[0] for t in cursor.fetchall()] @@ -41,10 +34,13 @@ def handle(self, *args, **options): self.stderr.write(f"Invalid SOLR URL: {solr_url}") return + self.stdout.write( + f"SOLR fl list (available for export): \n - {'\n - '.join(settings.IMPRESSO_SOLR_FIELDS_AS_LIST)}" + ) params = { "q": "*:*", "rows": 2, - "fl": ",".join(FLS), + "fl": settings.IMPRESSO_SOLR_FIELDS, } solr_response = requests.get( solr_url, @@ -52,20 +48,26 @@ def handle(self, *args, **options): params=params, ) solr_status = solr_response.status_code + self.stdout.write(f"SOLR URL: \n - {solr_url}") self.stdout.write(f"SOLR Status: \n - {solr_status}") + + if solr_status != 200: + self.stderr.write(f"Error: {solr_response.text}") + return + # example result # n of rows in solr solr_num_rows = solr_response.json()["response"]["numFound"] self.stdout.write(f"SOLR Num Rows: \n - {solr_num_rows}") - # example result + docs = solr_response.json()["response"]["docs"] - self.stdout.write(f"\n SOLR Example Docs:") + self.stdout.write(f"SOLR Example Docs:") for doc in docs: - self.stdout.write(f" - \nid:\033[94m{doc.get('id')}\033[0m") + self.stdout.write(f"\n - {doc.get(settings.IMPRESSO_SOLR_ID_FIELD)}") + for field in settings.IMPRESSO_SOLR_FIELDS_AS_LIST: + self.stdout.write(f" ├── {field}: \033[93m{doc.get(field)}\033[0m") - for field in FLS: - self.stdout.write(f" {field}: {doc.get(field)}") # ping redis self.stdout.write("\nChecking Redis connectivity...") import redis diff --git a/impresso/management/commands/exportqueryascsv.py b/impresso/management/commands/exportqueryascsv.py index 2f1c605..ac2bc0a 100644 --- a/impresso/management/commands/exportqueryascsv.py +++ b/impresso/management/commands/exportqueryascsv.py @@ -4,7 +4,9 @@ from impresso.utils.bitmap import check_bitmap_keys_overlap from impresso.tasks import export_query_as_csv from django.conf import settings -from impresso.solr import find_all, solr_doc_to_content_item +from impresso.solr import find_all +from impresso.utils.bitmask import BitMask64, is_access_allowed +from impresso.utils.solr import serialize_solr_doc_content_item_to_plain_dict from impresso.utils.tasks.export import 
helper_export_query_as_csv_progress @@ -24,8 +26,22 @@ def add_arguments(self, parser): action="store_true", help="Run the function behind the task immediately instead of delaying it with Celery", ) + parser.add_argument( + "--query_hash", + type=str, + help="The hash of the query string, if any, used to identify the query in the database", + ) - def handle(self, user_id, q, no_prompt=False, immediate=False, *args, **options): + def handle( + self, + user_id, + q, + no_prompt=False, + immediate=False, + query_hash="", + *args, + **options, + ): self.stdout.write("\n\n--- Export Solr Query as CSV file ---") self.stdout.write("Params \033[34m❤️\033[0m") self.stdout.write(f" user_id: {user_id}") @@ -55,17 +71,21 @@ def handle(self, user_id, q, no_prompt=False, immediate=False, *args, **options) ) # bitmap try: - user_bitmap = user.bitmap.get_up_to_date_bitmap() + user_bitmap_as_int = user.bitmap.get_bitmap_as_int() except User.bitmap.RelatedObjectDoesNotExist: - user_bitmap = UserBitmap.USER_PLAN_GUEST + user_bitmap_as_int = UserBitmap.USER_PLAN_GUEST self.stdout.write( self.style.WARNING( - f" no bitmap found for user, using default bitmap: {bin(user_bitmap)}" + f" no bitmap found for user, using default bitmap: {bin(user_bitmap_as_int)}" ) ) - self.stdout.write(f" user_current_bitmap: \033[34m{bin(user_bitmap)}\033[0m") + self.stdout.write( + f" user_current_bitmap: \033[34m{bin(user_bitmap_as_int)}\033[0m" + ) + user_bitmap_as_str = BitMask64(user_bitmap_as_int) + self.stdout.write(f" user bitmap as str: \033[34m{user_bitmap_as_str}\033[0m") # bitmap print out as base64 @@ -83,24 +103,23 @@ def handle(self, user_id, q, no_prompt=False, immediate=False, *args, **options) self.stdout.write(f"First document found as example:") first_doc = results["response"]["docs"][0] - first_content_item = solr_doc_to_content_item(first_doc) + first_content_item = serialize_solr_doc_content_item_to_plain_dict(first_doc) for k, v in first_content_item.items(): self.stdout.write(f" {k}: \033[34m{v}\033[0m") # check that user has right to export using the bitmaps - if "_bitmap_get_tr" in first_content_item.keys(): + if "_bm_get_tr_i" in first_doc.keys(): self.stdout.write( "\n\nCheck if user has right to export the first result Transcript using the bitmap" ) # if bitmap is a string of 0 and 1, convert it to int first - first_content_item_bitmap = first_content_item["_bitmap_get_tr"] - user_bitmap_as_str = bin(user_bitmap)[2:] - self.stdout.write(f" user bitmap: \033[34m{user_bitmap_as_str}\033[0m") + first_content_item_bitmap = first_content_item["_bm_get_tr_i"] self.stdout.write( f" content bitmap: \033[34m{first_content_item_bitmap}\033[0m" ) - overlap = check_bitmap_keys_overlap( - user_bitmap_as_str, first_content_item_bitmap + overlap = is_access_allowed( + accessor=user_bitmap_as_str, + content=BitMask64(first_content_item_bitmap), ) if overlap: self.stdout.write( @@ -133,7 +152,10 @@ def handle(self, user_id, q, no_prompt=False, immediate=False, *args, **options) return if not immediate: export_query_as_csv.delay( - query=q, user_id=user_id, description="from command management" + query=q, + user_id=user_id, + description="from command management", + query_hash=query_hash, ) self.stdout.write('"test" task launched, check celery.') self.stdout.write("\n\n---- end ----\n\n") @@ -153,7 +175,7 @@ def handle(self, user_id, q, no_prompt=False, immediate=False, *args, **options) page, loops, progress = helper_export_query_as_csv_progress( job=job, query=q, - query_hash="", + query_hash=query_hash, 
user_bitmap_key=user_bitmap_as_str, ) self.stdout.write(f"page: {page}, loops: {loops}, progress: {progress}") diff --git a/impresso/models/job.py b/impresso/models/job.py index d5c9331..b13098b 100644 --- a/impresso/models/job.py +++ b/impresso/models/job.py @@ -4,87 +4,76 @@ class Job(models.Model): - BULK_COLLECTION_FROM_QUERY = 'BCQ' - BULK_COLLECTION_FROM_QUERY_TR = 'BCT' - DELETE_COLLECTION = 'DCO' - SYNC_COLLECTION_TO_SOLR = 'IDX' - SYNC_SELECTED_COLLECTABLE_ITEMS_TO_SOLR = 'IDL' - SYNC_COLLECTIONS_TO_SOLR_TR = 'ITR' - EXPORT_COLLECTION_AS_CSV = 'EXP' - EXPORT_QUERY_AS_CSV = 'EXP' - TEST = 'TES' - CREATE_UPLOADED_IMAGE = 'IMG' - REMOVE_FROM_SOLR = 'RDX' - REMOVE_COLLECTIONS_FROM_SOLR_TR = 'RTR' + BULK_COLLECTION_FROM_QUERY = "BCQ" + BULK_COLLECTION_FROM_QUERY_TR = "BCT" + DELETE_COLLECTION = "DCO" + SYNC_COLLECTION_TO_SOLR = "IDX" + SYNC_SELECTED_COLLECTABLE_ITEMS_TO_SOLR = "IDL" + SYNC_COLLECTIONS_TO_SOLR_TR = "ITR" + EXPORT_COLLECTION_AS_CSV = "EXC" + EXPORT_QUERY_AS_CSV = "EXP" + TEST = "TES" + CREATE_UPLOADED_IMAGE = "IMG" + REMOVE_FROM_SOLR = "RDX" + REMOVE_COLLECTIONS_FROM_SOLR_TR = "RTR" TYPE_CHOICES = ( - (BULK_COLLECTION_FROM_QUERY, 'Bulk collection from query'), - (BULK_COLLECTION_FROM_QUERY_TR, 'Bulk collection from query TR passages'), - (DELETE_COLLECTION, 'Delete collection'), - (SYNC_COLLECTION_TO_SOLR, 'Index collection in search engine'), - (SYNC_SELECTED_COLLECTABLE_ITEMS_TO_SOLR, 'Index only collection for a few content items'), # noqa - (EXPORT_COLLECTION_AS_CSV, 'Export collection as CSV'), - (EXPORT_QUERY_AS_CSV, 'Export query as CSV'), - (TEST, '10 minutes countdown, 1 percent every 6 seconds'), - (CREATE_UPLOADED_IMAGE, 'Generate vector signature for the image and store the result in the db'), # noqa - (SYNC_COLLECTIONS_TO_SOLR_TR, 'Sync coll. to related TR passages'), - (REMOVE_FROM_SOLR, 'Remove collection from solr index'), - (REMOVE_COLLECTIONS_FROM_SOLR_TR, 'Remove coll. from SOLR TR index') + (BULK_COLLECTION_FROM_QUERY, "Bulk collection from query"), + (BULK_COLLECTION_FROM_QUERY_TR, "Bulk collection from query TR passages"), + (DELETE_COLLECTION, "Delete collection"), + (SYNC_COLLECTION_TO_SOLR, "Index collection in search engine"), + ( + SYNC_SELECTED_COLLECTABLE_ITEMS_TO_SOLR, + "Index only collection for a few content items", + ), # noqa + (EXPORT_COLLECTION_AS_CSV, "Export collection as CSV"), + (EXPORT_QUERY_AS_CSV, "Export query as CSV"), + (TEST, "10 minutes countdown, 1 percent every 6 seconds"), + ( + CREATE_UPLOADED_IMAGE, + "Generate vector signature for the image and store the result in the db", + ), # noqa + (SYNC_COLLECTIONS_TO_SOLR_TR, "Sync coll. to related TR passages"), + (REMOVE_FROM_SOLR, "Remove collection from solr index"), + (REMOVE_COLLECTIONS_FROM_SOLR_TR, "Remove coll. from SOLR TR index"), ) - READY = 'REA' - RUN = 'RUN' - DONE = 'DON' - ERR = 'ERR' - ARCHIVED = 'ARC' - STOP = 'STO' - RIP = 'RIP' + READY = "REA" + RUN = "RUN" + DONE = "DON" + ERR = "ERR" + ARCHIVED = "ARC" + STOP = "STO" + RIP = "RIP" STATUS_CHOICES = ( - (READY, 'ready'), - (RUN, 'running'), - (DONE, 'Finished, no errors!'), - (ARCHIVED, 'Finished, archived by the user'), - (STOP, 'Please stop'), - (RIP, 'Stopped by user! Rest IN Peace...'), - (ERR, 'Ops, errors!'), + (READY, "ready"), + (RUN, "running"), + (DONE, "Finished, no errors!"), + (ARCHIVED, "Finished, archived by the user"), + (STOP, "Please stop"), + (RIP, "Stopped by user! 
Rest IN Peace..."), + (ERR, "Ops, errors!"), ) type = models.CharField(max_length=3, choices=TYPE_CHOICES) - status = models.CharField(max_length=3, choices=STATUS_CHOICES, - default=READY) + status = models.CharField(max_length=3, choices=STATUS_CHOICES, default=READY) date_created = models.DateTimeField(auto_now_add=True) date_last_modified = models.DateTimeField(auto_now=True) creator = models.ForeignKey(User, on_delete=models.CASCADE) - extra = models.TextField() - description = models.TextField(default='') + extra = models.TextField(default="{}") - def get_task_meta(self, taskname, progress=0.0, extra={}): - meta = { - 'task': self.type, - 'taskname': taskname, - 'progress': progress, - 'job': { - 'id': self.pk, - 'type': self.type, - 'status': self.status, - 'date_created': self.date_created.isoformat() - }, - 'user_id': self.creator.pk, - 'user_uid': self.creator.profile.uid, - } - meta.update(extra) - return meta + description = models.TextField(default="") def get_progress(self): try: - json.loads(self.extra).get('progress', 0.0) + json.loads(self.extra).get("progress", 0.0) except json.JSONDecodeError: return 0.0 class Meta: - db_table = 'jobs' - verbose_name_plural = 'jobs' + db_table = "jobs" + verbose_name_plural = "jobs" diff --git a/impresso/models/profile.py b/impresso/models/profile.py index eae1f3a..9d9e379 100644 --- a/impresso/models/profile.py +++ b/impresso/models/profile.py @@ -2,6 +2,7 @@ from django.db import models from django.contrib.auth.models import User + class Profile(models.Model): """ Store a few auth related information about a site user and custom info. @@ -9,24 +10,45 @@ class Profile(models.Model): Why? because "Reusable apps shouldn’t implement a custom user model. A project may use many apps, and two reusable apps that implemented a custom user model couldn’t be used together." - + Attributes: + user (models.OneToOneField): A one-to-one relationship with the User model. + uid (models.CharField): A unique identifier for the profile. + provider (models.CharField): The authentication provider, with choices of 'local' or 'Github'. + displayname (models.CharField): The display name of the user, optional. + picture (models.URLField): The URL to the user's profile picture, optional. + pattern (models.CharField): A custom pattern associated with the user, optional. + email_accepted (models.BooleanField): Indicates if the user has accepted email notifications. + max_loops_allowed (models.IntegerField): The maximum number of loops allowed for saving queries. + max_parallel_jobs (models.IntegerField): The maximum number of concurrent running jobs. + Constants: + PROVIDER_LOCAL (str): The local provider constant. + PROVIDER_CHOICES (tuple): The tuple of provider choices. + Meta: + verbose_name_plural (str): The plural name for the model. + db_table (str): The database table name for the model. 
""" - PROVIDER_LOCAL = 'local' + + PROVIDER_LOCAL = "local" PROVIDER_CHOICES = ( - (PROVIDER_LOCAL, 'local'), - ('Github', 'Github'), + (PROVIDER_LOCAL, "local"), + ("Github", "Github"), ) - user = models.OneToOneField(User, on_delete=models.CASCADE) - uid = models.CharField(max_length=32, unique=True,) - provider = models.CharField(max_length=10, choices=PROVIDER_CHOICES, default=PROVIDER_LOCAL) + user = models.OneToOneField(User, on_delete=models.CASCADE) + uid = models.CharField( + max_length=32, + unique=True, + ) + provider = models.CharField( + max_length=10, choices=PROVIDER_CHOICES, default=PROVIDER_LOCAL + ) # social auth fields displayname = models.CharField(max_length=100, null=True, blank=True) - picture = models.URLField(null=True, blank=True) + picture = models.URLField(null=True, blank=True) # add pattern ;) - pattern = models.CharField(max_length=100, null=True, blank=True) + pattern = models.CharField(max_length=100, null=True, blank=True) # is in mailing list. email_accepted = models.BooleanField(default=False) @@ -36,7 +58,7 @@ class Profile(models.Model): # maximum concurrent running jobs max_parallel_jobs = models.IntegerField(default=2) - + class Meta: - verbose_name_plural = 'profiles' - db_table = 'profiles' + verbose_name_plural = "profiles" + db_table = "profiles" diff --git a/impresso/models/userBitmap.py b/impresso/models/userBitmap.py index e394858..521a3b4 100644 --- a/impresso/models/userBitmap.py +++ b/impresso/models/userBitmap.py @@ -1,8 +1,10 @@ import logging from django.db import models +from django.conf import settings from django.contrib.auth.models import User from .datasetBitmapPosition import DatasetBitmapPosition from django.db.models.signals import m2m_changed +from ..utils.bitmask import int_to_bytes logger = logging.getLogger(__name__) @@ -17,58 +19,59 @@ class UserBitmap(models.Model): # Impresso Registered User impresso Account created, no academic afiliation # Student or Teacher - Educational User educational Account created, educational academic afiliation # Researcher - Academic User researcher Account created, research academic afiliation - USER_PLAN_GUEST = 0b10000 - USER_PLAN_AUTH_USER = 0b11000 - USER_PLAN_EDUCATIONAL = 0b11100 - USER_PLAN_RESEARCHER = 0b11110 + USER_PLAN_GUEST = 0b1 + USER_PLAN_AUTH_USER = 0b11 + USER_PLAN_EDUCATIONAL = 0b111 + USER_PLAN_RESEARCHER = 0b1011 BITMAP_PLAN_MAX_LENGTH = 5 - def get_up_to_date_bitmap(self) -> int: - # if the user hasn't accepted terms of use, return the default bitmap - if not self.date_accepted_terms: - return UserBitmap.USER_PLAN_GUEST + def get_up_to_date_bitmap(self) -> bytes: """ - Get the bitmap using the groups the user is affiliated to and the affiliations to the DatasetBitmapPosition - The four first bits (starting on the left, indices 0-3) are the ones relating to the user plans - Then there is an empy bit (index 4) and the rest of the bits are for the user's subscriptions to the datasets. - The user bitmap relating to user plans is cumulative, hence, any user that is a researcher (bit #3 = 1) has all preceeding - bits also set to 1 : 1111 [archive bits...]. - All users have at least the "guest" bit set to 1 (bit #1): 10000 [archive bits, all 0] + Get the bitmap using the groups the user is affiliated to and the affiliations + to the DatasetBitmapPosition. + All users are 0b1 by default, and the bitmap is updated to 0b11 if the user has accepted the terms of use. 
+ The bitmap is updated to 0b111 if the user is affiliated to the educational group, or to + 0b1101 if the user is affiliated to the researcher group. + The remaining bits are defined by the user's affiliations to the DatasetBitmapPosition. + + Args: + None + + Returns: + bytes: The user's bitmap as a byte array. """ + # if the user hasn't accepted terms of use, return the default bitmap + if not self.date_accepted_terms: + return int_to_bytes(UserBitmap.USER_PLAN_GUEST) + # get all groups the user is affiliated to as flat array, ordered by a-z groups = [group.name for group in self.user.groups.all()] - if "plan-researcher" in groups: - bitmap = UserBitmap.USER_PLAN_RESEARCHER - elif "plan-educational" in groups: - bitmap = UserBitmap.USER_PLAN_EDUCATIONAL + if settings.IMPRESSO_GROUP_USER_PLAN_RESEARCHER in groups: + value = UserBitmap.USER_PLAN_RESEARCHER + elif settings.IMPRESSO_GROUP_USER_PLAN_EDUCATIONAL in groups: + value = UserBitmap.USER_PLAN_EDUCATIONAL else: - bitmap = UserBitmap.USER_PLAN_AUTH_USER + value = UserBitmap.USER_PLAN_AUTH_USER # print current bitmap # print(f"current bitmap: {bitmap:05b}") # get all user subscriptions subscriptions = list(self.subscriptions.values("name", "bitmap_position")) if not subscriptions: - return bitmap - # max bitmap position - max_position = ( - max([x["bitmap_position"] for x in subscriptions]) - + UserBitmap.BITMAP_PLAN_MAX_LENGTH - + 1 - ) - # Shift the initial signature to the left by the max bit position - bitmap = bitmap << max_position - UserBitmap.BITMAP_PLAN_MAX_LENGTH - # print(f"current empty bitmap: {bitmap:05b}") - for subscription in subscriptions: - # Use the bitmap position to set the corresponding bit - position = ( - subscription["bitmap_position"] + UserBitmap.BITMAP_PLAN_MAX_LENGTH - ) - bitmap |= 1 << (max_position - position - 1) - - return bitmap + return int_to_bytes(value) + # Set the bits for each subscription + for s in subscriptions: + value |= 1 << (s["bitmap_position"] + UserBitmap.BITMAP_PLAN_MAX_LENGTH) + + return int_to_bytes(value) def get_bitmap_as_int(self): + """ + Converts the bitmap from bytes to an integer. + + Returns: + int: The bitmap as an integer. 
+ """ return int.from_bytes(self.bitmap, byteorder="big") def get_bitmap_as_key_str(self): @@ -82,24 +85,34 @@ def get_bitmap_as_key_str(self): def get_user_plan(self): if not self.bitmap: - return "GUEST" + return "- (no bitmap)" if not self.date_accepted_terms: - return "GUEST" + return "- (terms not accepted)" + # get the first bits of the bitmap up to the max length bitmap_int = self.get_bitmap_as_int() - bitmap_length = bitmap_int.bit_length() - # Extract the first 5 bits - bitmap_plan = ( - bitmap_int >> (bitmap_length - UserBitmap.BITMAP_PLAN_MAX_LENGTH) - ) & 0b11111 - if bitmap_plan == UserBitmap.USER_PLAN_GUEST: - return "GUEST" - if bitmap_plan == UserBitmap.USER_PLAN_AUTH_USER: - return "AUTH_USER" - if bitmap_plan == UserBitmap.USER_PLAN_EDUCATIONAL: - return "EDUCATIONAL" - if bitmap_plan == UserBitmap.USER_PLAN_RESEARCHER: - return "RESEARCHER" - return "AUTH_USER" + plan = bitmap_int & 0b1111 + if plan == UserBitmap.USER_PLAN_GUEST: + return "guest" + if plan == UserBitmap.USER_PLAN_AUTH_USER: + return "basic" + if plan == UserBitmap.USER_PLAN_EDUCATIONAL: + return settings.IMPRESSO_GROUP_USER_PLAN_EDUCATIONAL + if plan == UserBitmap.USER_PLAN_RESEARCHER: + return settings.IMPRESSO_GROUP_USER_PLAN_RESEARCHER + return bin(plan) + + # bitmap_plan = ( + # bitmap_int >> (bitmap_length - UserBitmap.BITMAP_PLAN_MAX_LENGTH) + # ) & 0b11111 + # if bitmap_plan == UserBitmap.USER_PLAN_GUEST: + # return "GUEST" + # if bitmap_plan == UserBitmap.USER_PLAN_AUTH_USER: + # return "AUTH_USER" + # if bitmap_plan == UserBitmap.USER_PLAN_EDUCATIONAL: + # return "EDUCATIONAL" + # if bitmap_plan == UserBitmap.USER_PLAN_RESEARCHER: + # return "RESEARCHER" + # return "AUTH_USER" def __str__(self): bitmap = self.get_bitmap_as_int() @@ -111,43 +124,31 @@ class Meta: def save(self, *args, **kwargs): if not self.date_accepted_terms: - user_bitmap = UserBitmap.USER_PLAN_GUEST - bitmap_bytes = user_bitmap.to_bytes( - (user_bitmap.bit_length() + 7) // 8, byteorder="big" - ) - self.bitmap = bitmap_bytes + self.bitmap = int_to_bytes(UserBitmap.USER_PLAN_GUEST) + else: + self.bitmap = self.get_up_to_date_bitmap() super().save(*args, **kwargs) -def update_user_bitmap(sender, instance, action, **kwargs): - if action == "post_add" or action == "post_remove": +def update_user_bitmap_on_subscriptions_changed(sender, instance, action, **kwargs): + if action == "post_add" or action == "post_remove" or action == "post_clear": logger.info(f"User {instance.user} subscription changed, updating") - user_bitmap = instance.get_up_to_date_bitmap() - bitmap_bytes = user_bitmap.to_bytes( - (user_bitmap.bit_length() + 7) // 8, byteorder="big" - ) - instance.bitmap = bitmap_bytes instance.save() - logger.info( - f"User {instance.user} subscription changed, bitmap updated to {user_bitmap:05b}" - ) -def update_user_bitmap_on_user_groups_changed(sender, instance, action, **kwargs): - if action == "post_add" or action == "post_remove": +def update_user_bitmap_on_user_groups_changed( + sender, instance: User, action, **kwargs +) -> None: + if action == "post_add" or action == "post_remove" or action == "post_clear": user_bitmap, created = UserBitmap.objects.get_or_create(user=instance) logger.info( f"User {instance} groups changed. {'Creating new bitmap.' 
if created else 'Updating bitmap.'}" ) - bitmap = user_bitmap.get_up_to_date_bitmap() - bitmap_bytes = bitmap.to_bytes((bitmap.bit_length() + 7) // 8, byteorder="big") - user_bitmap.bitmap = bitmap_bytes user_bitmap.save() - logger.info(f"User {instance} groups changed, bitmap updated to {bitmap:05b}") m2m_changed.connect( - update_user_bitmap, + update_user_bitmap_on_subscriptions_changed, sender=UserBitmap.subscriptions.through, ) diff --git a/impresso/settings.py b/impresso/settings.py index 6d261b2..b435c63 100644 --- a/impresso/settings.py +++ b/impresso/settings.py @@ -75,8 +75,20 @@ "PASSWORD": get_env_variable("IMPRESSO_DB_PASSWORD"), "HOST": get_env_variable("IMPRESSO_DB_HOST"), "PORT": get_env_variable("IMPRESSO_DB_PORT"), + "TEST": { + "NAME": get_env_variable("IMPRESSO_DB_NAME_TEST", "impresso_test"), + "ENGINE": get_env_variable( + "IMPRESSO_DB_ENGINE_TEST", "django.db.backends.sqlite3" + ), + }, } } +import sys + +if "test" in sys.argv: + DATABASES["default"]["ENGINE"] = "django.db.backends.sqlite3" + DATABASES["default"]["TEST"]["NAME"] = ":memory:" + DEFAULT_AUTO_FIELD = "django.db.models.AutoField" @@ -165,20 +177,27 @@ "meta_issue_id_s": "issue", "meta_partnerid_s": "content_provider", "meta_topics_s": "newspaper_topics", - "meta_polorient_s": "political_orientation", + "meta_polorient_s": "newspaper_political_orientation", "olr_b": "is_olr", - "page_id_ss": "pages_uids", + # "page_id_ss": "pages_uids", "page_nb_is": "pages", "nb_pages_i": "nb_pages", - "front_b": "is_on_front", + "front_b": "is_on_front_page", "meta_date_dt": "date", "pers_mentions": "persons_mentioned", "loc_mentions": "locations_mentioned", + "nag_mentions": "newsagencies_mentioned", "access_right_s": "access_right", "score": "relevance", "exportable_plain": "is_content_available", "ucoll_ss": "collections", - "bm_get_tr_s": "_bitmap_get_tr", + "topics_dpfs": "topics", + # "cc_b": "cc_b", + # bitmap keys, we still maintain both for compatibility reasons + "bm_get_tr_s": "_bm_get_tr_s", + "bm_get_tr_i": "_bm_get_tr_i", + # note: `_bin` fields are deprecated as it would require a custom JSONEncoder (and regexp within the raw_decode, which is not the best idea) + # "bm_get_tr_bin": "_bm_get_tr_s_bin", } IMPRESSO_SOLR_URL_SELECT = os.path.join(get_env_variable("IMPRESSO_SOLR_URL"), "select") @@ -198,18 +217,18 @@ IMPRESSO_SOLR_ID_FIELD = get_env_variable("IMPRESSO_SOLR_ID_FIELD", "id") IMPRESSO_SOLR_FIELDS = get_env_variable( "IMPRESSO_SOLR_FIELDS", - "id,item_type_s,meta_journal_s,lg_s,title_txt_de,title_txt_fr,content_txt_de,content_txt_fr,content_length_i,meta_date_dt,meta_year_i,meta_issue_id_s,page_nb_is,nb_pages_i,front_b,meta_country_code_s,pers_mentions,loc_mentions,access_right_s,meta_partnerid_s,exportable_plain,score,ucoll_ss,bm_get_tr_s", + ",".join(IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS.keys()), ) # check that settings.IMPRESSO_SOLR_FIELDS is set according to the fields specified in the mapping # settings.IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS. 
# raise an error if not -impresso_solr_fields = IMPRESSO_SOLR_FIELDS.split(",") +IMPRESSO_SOLR_FIELDS_AS_LIST = IMPRESSO_SOLR_FIELDS.split(",") # check that every item in impresso_solr_fields is in the keys of IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS impresso_solr_fields_to_article_props_keys = ( IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS.keys() ) -for field in impresso_solr_fields: +for field in IMPRESSO_SOLR_FIELDS_AS_LIST: if field not in impresso_solr_fields_to_article_props_keys: raise ValueError( f"IMPRESSO_SOLR_FIELDS and IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS do not match: check field {field}" @@ -218,7 +237,10 @@ IMPRESSO_SOLR_ARTICLE_PROPS = sorted( list( set( - [IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS.get(x) for x in impresso_solr_fields] + [ + IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS.get(x) + for x in IMPRESSO_SOLR_FIELDS_AS_LIST + ] ) ), key=lambda x: (x != "uid", x), @@ -250,6 +272,9 @@ IMPRESSO_GIT_BRANCH = get_env_variable("IMPRESSO_GIT_BRANCH", "?") IMPRESSO_GIT_REVISION = get_env_variable("IMPRESSO_GIT_REVISION", "?") +IMPRESSO_GROUP_USER_PLAN_EDUCATIONAL = "plan-educational" +IMPRESSO_GROUP_USER_PLAN_RESEARCHER = "plan-researcher" + # Logging LOGGING = { "version": 1, diff --git a/impresso/solr.py b/impresso/solr.py index 68f8e11..ee32f19 100644 --- a/impresso/solr.py +++ b/impresso/solr.py @@ -1,20 +1,41 @@ import requests import json +import logging from django.conf import settings -from typing import Dict, Any +from typing import Dict, Any, Optional, List def find_all( - q="*:*", - fl=settings.IMPRESSO_SOLR_ID_FIELD, - skip=0, - limit=settings.IMPRESSO_SOLR_EXEC_LIMIT, - url=settings.IMPRESSO_SOLR_URL_SELECT, - auth=settings.IMPRESSO_SOLR_AUTH, - logger=None, - sort="id ASC", - fq="", # {!collapse field=ISBN} -): + q: str = "*:*", + fl: str = settings.IMPRESSO_SOLR_ID_FIELD, + skip: int = 0, + limit: int = settings.IMPRESSO_SOLR_EXEC_LIMIT, + url: str = settings.IMPRESSO_SOLR_URL_SELECT, + auth: tuple = settings.IMPRESSO_SOLR_AUTH, + logger: Optional[logging.Logger] = None, + sort: str = "id ASC", + fq: str = "", +) -> Dict[str, Any]: + """ + Execute a query against a Solr instance and return the results. + + Args: + q (str): The query string. Defaults to "*:*". + fl (str): The fields to return. Defaults to settings.IMPRESSO_SOLR_ID_FIELD. + skip (int): The number of records to skip. Defaults to 0. + limit (int): The maximum number of records to return. Defaults to settings.IMPRESSO_SOLR_EXEC_LIMIT. + url (str): The Solr URL to send the request to. Defaults to settings.IMPRESSO_SOLR_URL_SELECT. + auth (tuple): Authentication credentials for Solr. Defaults to settings.IMPRESSO_SOLR_AUTH. + logger (Optional[logging.Logger]): Logger instance for logging. Defaults to None. + sort (str): The sort order of the results. Defaults to "id ASC". + fq (str): The filter query. Defaults to an empty string. + + Returns: + dict: The response from the Solr instance as a dictionary. + + Raises: + requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code. 
+ """ if logger: logger.info("query:{} skip:{}".format(q, skip)) @@ -39,47 +60,25 @@ def find_all( else: print(res.text) raise - return res.json() - - -def solr_doc_to_content_item( - doc: Dict[str, Any], - field_mapping: Dict[str, str] = settings.IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS, -) -> Dict[str, str]: - """ - Convert a Solr document to a content item object as adict - - Args: - doc: Solr document - field_mapping: Mapping between Solr fields and content item properties - - Returns: - dict: Content item object - """ - result: Dict[str, str] = {} - - for k, v in doc.items(): - prop = field_mapping.get(k, None) - if prop is None: - continue - if isinstance(v, list): - result[prop] = ",".join(str(x) for x in v) - elif not result.get(prop, ""): - result[prop] = v - - return result + data = res.json() + return data -def find_collections_by_ids(ids): +def find_collections_by_ids(ids: List[str]) -> List[Dict[str, Any]]: res = find_all( q=" OR ".join(map(lambda id: "id:%s" % id, ids)), fl="id,ucoll_ss,_version_", limit=len(ids), ) - return res.get("response").get("docs") + return res.get("response", {}).get("docs", []) -def update(todos, url=None, auth=settings.IMPRESSO_SOLR_AUTH, logger=None): +def update( + todos: List[Dict[str, Any]], + url: Optional[str] = None, + auth: tuple = settings.IMPRESSO_SOLR_AUTH, + logger: Optional[logging.Logger] = None, +) -> Dict[str, Any]: if logger: logger.info(f"todos n:{len(todos)} for url:{url}") res = requests.post( diff --git a/impresso/tasks.py b/impresso/tasks.py index 98caaac..4289124 100644 --- a/impresso/tasks.py +++ b/impresso/tasks.py @@ -112,7 +112,7 @@ def export_query_as_csv_progress( job_id: int, query: str, search_query_id: int, - user_bitmap_key: str, + user_bitmap_key: int, query_hash: str = "", progress: float = 0.0, skip: int = 0, @@ -130,7 +130,7 @@ def export_query_as_csv_progress( job_id (int): The ID of the job to update. query (str): The query string to execute. search_query_id (int): The ID of the search query. - user_bitmap_key (str): The user bitmap key. + user_bitmap_key (int): The user bitmap key, as int. query_hash (str, optional): The hash of the query. Defaults to an empty string. skip (int, optional): The number of records to skip. Defaults to 0. limit (int, optional): The maximum number of records to retrieve per page. Defaults to 100. 
@@ -140,13 +140,7 @@ def export_query_as_csv_progress( """ # get the job so that we can update its status job = Job.objects.get(pk=job_id) - extra = { - "query": query_hash, - "search_query_id": search_query_id, - } - if is_task_stopped( - task=self, job=job, progress=progress, extra=extra, logger=logger - ): + if is_task_stopped(task=self, job=job, progress=progress, logger=logger): return page, loops, progress = helper_export_query_as_csv_progress( @@ -161,9 +155,7 @@ def export_query_as_csv_progress( if page < loops: job.status = Job.RUN - update_job_progress( - task=self, job=job, progress=progress, extra=extra, logger=logger - ) + update_job_progress(task=self, job=job, progress=progress, logger=logger) export_query_as_csv_progress.delay( job_id=job.pk, query=query, @@ -174,7 +166,7 @@ def export_query_as_csv_progress( limit=limit, ) else: - update_job_completed(task=self, job=job, extra=extra, logger=logger) + update_job_completed(task=self, job=job, logger=logger) @app.task(bind=True) @@ -206,34 +198,81 @@ def export_query_as_csv( creator_id=user_id, description=description, ) - + attachment = Attachment.create_from_job(job, extension="csv") + # if decri # get user bitmap, if any - try: - user_bitmap_key = job.creator.bitmap.get_bitmap_as_key_str()[:2] - except User.bitmap.RelatedObjectDoesNotExist: - print(job.creator.bitmap) - logger.info(f"[job:{job.pk} user:{user_id}] no bitmap found for user!") - user_bitmap_key = bin(UserBitmap.USER_PLAN_GUEST)[:2] + user_bitmap, created = UserBitmap.objects.get_or_create(user_id=user_id) logger.info( f"[job:{job.pk} user:{user_id}] launched! " - f"query:{query_hash} bitmap:{user_bitmap_key}" + f"- Using bitmap {user_bitmap.get_bitmap_as_int()} (created:{created}) " + f"- attachment:{attachment.pk}" ) - attachment = Attachment.create_from_job(job, extension="csv") - if not search_query_id: - search_query, created = SearchQuery.objects.get_or_create( - id=SearchQuery.generate_id(creator_id=user_id, query=query_hash), - defaults={ - "data": query_hash, - "description": description, - "creator_id": user_id, - }, - ) + update_job_progress( + task=self, + job=job, + taskstate=TASKSTATE_INIT, + progress=0.0, + logger=logger, + extra={"query": query, "query_hash": query_hash}, + ) + + export_query_as_csv_progress.delay( + job_id=job.pk, + query=query, + query_hash=query_hash, + search_query_id=search_query_id, + user_bitmap_key=user_bitmap.get_bitmap_as_int(), + ) + + +@app.task(bind=True) +def export_collection_as_csv( + self, + user_id: int, + collection_id: int, + query: str, + query_hash: str = "", +) -> None: + """ + Initiates a job to export a collection as a CSV file and starts the export_query_as_csv_progress task + like export_query_as_csv. + + Args: + self: The instance of the class. + user_id (int): The ID of the user initiating the export. + collection_id (int): The ID of the collection to be exported. + query (str): The query string to be exported. + query_hash (str, optional): A hash of the query string. Defaults to an empty string. - search_query_id = search_query.pk + Returns: + None + + """ + user_bitmap, created = UserBitmap.objects.get_or_create(user_id=user_id) + try: + collection = Collection.objects.get(pk=collection_id, creator__id=user_id) + except Collection.DoesNotExist: + logger.error(f"[job:{job.pk} user:{user_id}] no collection found for user!") + return + # save current job then start export_query_as_csv task. 
+ job = Job.objects.create( + type=Job.EXPORT_QUERY_AS_CSV, + creator_id=user_id, + description=collection.name, + extra={ + "collection": get_collection_as_obj(collection), + "query": query, + "query_hash": query_hash, + }, + ) + # create empty attachment and attach automatically to the job + attachment = Attachment.create_from_job(job, extension="csv") logger.info( - f"[job:{job.pk} user:{user_id}] started!" - f" search_query_id:{search_query_id} created:{created}, attachment:{attachment.upload.path}" + f"[job:{job.pk} user:{user_id}] launched! " + f"- Using bitmap {user_bitmap.get_bitmap_as_int()} (created:{created}) " + f"- attachment:{attachment.pk} " + f"- query:{query_hash} description:{job.description}" ) # add query to extra. Job status should be INIT @@ -242,10 +281,6 @@ def export_query_as_csv( job=job, taskstate=TASKSTATE_INIT, progress=0.0, - extra={ - "query": query_hash, - "search_query_id": search_query_id, - }, logger=logger, ) @@ -253,8 +288,7 @@ def export_query_as_csv( job_id=job.pk, query=query, query_hash=query_hash, - search_query_id=search_query_id, - user_bitmap_key=user_bitmap_key, + user_bitmap_key=user_bitmap.get_bitmap_as_int(), ) diff --git a/impresso/tests/__init__.py b/impresso/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/impresso/tests/models/__init__.py b/impresso/tests/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/impresso/tests/models/test_job.py b/impresso/tests/models/test_job.py new file mode 100644 index 0000000..29b26e9 --- /dev/null +++ b/impresso/tests/models/test_job.py @@ -0,0 +1,80 @@ +import json +from django.test import TestCase +from django.contrib.auth.models import User +from ...models import Job, Profile +from ...utils.tasks import TASKSTATE_PROGRESS, update_job_progress +from ...utils.tasks import TASKSTATE_SUCCESS, update_job_completed + + +class FakeTask: + type = Job.TEST + name = "Fake Task" + + def update_state(self, state, meta): + pass + + +USER_UID = "local-testuser" + + +class JobTestCase(TestCase): + + def setUp(self): + self.user = User.objects.create_user(username="testuser", password="12345") + self.profile = Profile.objects.create(user=self.user, uid=USER_UID) + # thsi is normally created directly inside the main celery task + # to be tested in a specific test_tasks.py file + self.job = Job.objects.create( + type=Job.TEST, status=Job.READY, creator=self.user + ) + + self.task = FakeTask() + + def test_job_extra_after_task_init(self): + # create a very fake celery task instance + # use the task to update the job progress + update_job_progress( + task=self.task, + job=self.job, + taskstate=TASKSTATE_PROGRESS, + progress=0.1, + message="Task is initialising", + logger=None, + ) + # get the job extra field as a dictionary from textfield + task_meta = json.loads(self.job.extra) + # {'channel': 'local-testuser', 'taskname': 'Fake Task', 'taskstate': 'PROGRESS', 'progress': 0.1, 'message': 'Task is initialising'} + self.assertEqual(task_meta["channel"], USER_UID) + self.assertEqual(task_meta["taskname"], "Fake Task") + self.assertEqual(task_meta["progress"], 0.1) + self.assertEqual(task_meta["taskstate"], TASKSTATE_PROGRESS) + self.assertEqual(task_meta["message"], "Task is initialising") + + # let's simulate progress task + self.job.status = Job.RUN + update_job_progress( + task=self.task, + job=self.job, + taskstate=TASKSTATE_PROGRESS, + progress=0.5, + message="Task is progressing", + logger=None, + ) + task_meta = json.loads(self.job.extra) + 
self.assertEqual(task_meta["channel"], USER_UID) + self.assertEqual(task_meta["taskname"], "Fake Task") + self.assertEqual(task_meta["progress"], 0.5) + self.assertEqual(task_meta["taskstate"], TASKSTATE_PROGRESS) + self.assertEqual(task_meta["message"], "Task is progressing") + + def test_job_extra_after_task_complete(self): + update_job_completed( + task=self.task, + job=self.job, + ) + task_meta = json.loads(self.job.extra) + + self.assertEqual(task_meta["channel"], USER_UID) + self.assertEqual(task_meta["taskname"], "Fake Task") + self.assertEqual(task_meta["progress"], 1.0) + self.assertEqual(task_meta["taskstate"], TASKSTATE_SUCCESS) diff --git a/impresso/tests/models/test_userBitmap.py b/impresso/tests/models/test_userBitmap.py new file mode 100644 index 0000000..6864398 --- /dev/null +++ b/impresso/tests/models/test_userBitmap.py @@ -0,0 +1,183 @@ +from django.conf import settings +from django.test import TestCase +from django.contrib.auth.models import User, Group +from ...models import Profile, UserBitmap, DatasetBitmapPosition +from django.utils import timezone +from ...utils.bitmask import BitMask64, is_access_allowed + + +class UserBitmapTestCase(TestCase): + + def setUp(self): + self.user = User.objects.create_user(username="testuser", password="12345") + self.profile = Profile.objects.create(user=self.user, uid="local-testuser") + self.userBitmap = UserBitmap.objects.create( + user=self.user, + ) + self.groupPlanResearcher = Group.objects.create( + name=settings.IMPRESSO_GROUP_USER_PLAN_RESEARCHER + ) + self.groupPlanEducational = Group.objects.create( + name=settings.IMPRESSO_GROUP_USER_PLAN_EDUCATIONAL + ) + self.test_subscription_domain_A = DatasetBitmapPosition.objects.create( + name="Domain of TEST A archives", + ) + self.test_subscription_domain_B = DatasetBitmapPosition.objects.create( + name="Domain of TEST B archives", + ) + self.test_subscription_domain_C = DatasetBitmapPosition.objects.create( + name="Domain of TEST C archives", + ) + self.test_subscription_domain_D = DatasetBitmapPosition.objects.create( + name="Domain of TEST D archives", + ) + + def test_user_bitmap_lifecycle(self): + self._user_bitmap_guest_to_researcher() + self._user_bitmap_add_remove_subscriptions() + self._user_bitmap_check_access_only_auth_user() + self._user_bitmap_check_access_subscriptions() + + def _user_bitmap_guest_to_researcher(self): + self.assertEqual( + str(self.userBitmap), + f"testuser Bitmap {bin(UserBitmap.USER_PLAN_GUEST)}", + "User has only access to public domain content as the terms have not been accepted yet", + ) + # the user accepts the terms: + self.userBitmap.date_accepted_terms = timezone.now() + self.userBitmap.save() + # get the latest bitmap + # updated_bitmap = self.userBitmap.get_up_to_date_bitmap() + self.assertEqual( + str(self.userBitmap), + f"testuser Bitmap {bin(UserBitmap.USER_PLAN_AUTH_USER)}", + "User has only access to public domain content as the terms have not been accepted yet", + ) + # just add user to the researcher group + self.user.groups.add(self.groupPlanResearcher) + # test update_user_bitmap_on_user_groups_changed signal + self.userBitmap.refresh_from_db() + + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + UserBitmap.USER_PLAN_RESEARCHER, + "User has access to Researcher content", + ) + # if we change the terms of use, the bitmap should be updated + self.userBitmap.date_accepted_terms = None + self.userBitmap.save() + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + UserBitmap.USER_PLAN_GUEST, + "User has access to 
public domain content if the terms changed and have not been accepted", + ) + + def _user_bitmap_add_remove_subscriptions(self): + self.userBitmap.date_accepted_terms = timezone.now() + self.userBitmap.save() + + self.userBitmap.subscriptions.add( + self.test_subscription_domain_B, + ) + # adding a subscription trigger a post_save, let's get it back + self.userBitmap.refresh_from_db() + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + 0b1001011, + "User researcher has access to subscription TEST B", + ) + + # remove the subscription to B and add the subscription to A + self.userBitmap.subscriptions.remove(self.test_subscription_domain_B) + self.userBitmap.subscriptions.add(self.test_subscription_domain_A) + self.userBitmap.refresh_from_db() + + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + 0b101011, + "User researcher has access to subscription TEST A", + ) + + self.userBitmap.subscriptions.add( + self.test_subscription_domain_C, + ) + self.userBitmap.refresh_from_db() + self.assertEqual( + [x for x in self.userBitmap.subscriptions.values_list("name", flat=True)], + ["Domain of TEST A archives", "Domain of TEST C archives"], + ) + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + 0b10101011, + f"User researcher has access to subscription TEST A and TEST C, current:{bin(self.userBitmap.get_bitmap_as_int())}", + ) + + def _user_bitmap_check_access_only_auth_user(self): + user_bitmask = BitMask64(self.userBitmap.bitmap) + content_bitmask = BitMask64(2) + self.assertEqual( + str(content_bitmask), + "0000000000000000000000000000000000000000000000000000000000000010", + "Content 10 is accessible to all authenticated users", + ) + result = is_access_allowed(user_bitmask, content_bitmask) + self.assertTrue(result, "User has access to content 2") + + def _user_bitmap_check_access_subscriptions(self): + # use has now two subscriptions, A and C + self.assertEqual( + [x for x in self.userBitmap.subscriptions.values_list("name", flat=True)], + ["Domain of TEST A archives", "Domain of TEST C archives"], + "User has access to subscription TEST A and TEST C", + ) + + user_bitmask = BitMask64(self.userBitmap.bitmap) + self.assertEqual( + str(user_bitmask), + "0000000000000000000000000000000000000000000000000000000010101011", + "User researcher has access to subscription TEST A and TEST C", + ) + content_bitmask = BitMask64(0b100000100) + self.assertEqual( + str(content_bitmask), + "0000000000000000000000000000000000000000000000000000000100000100", + "Content 0b100000100 is only accessible to students and to a D subscribers", + ) + self.assertFalse( + is_access_allowed(user_bitmask, content_bitmask), + "User does NOT have access to content {content_bitmask}", + ) + # add the correct subscription + self.userBitmap.subscriptions.add(self.test_subscription_domain_D) + self.userBitmap.refresh_from_db() + self.assertTrue( + is_access_allowed(BitMask64(self.userBitmap.bitmap), content_bitmask), + f"User now has finally access to content subscription D! {content_bitmask}", + ) + + content_bitmask = BitMask64(10) + self.assertEqual( + str(content_bitmask), + "0000000000000000000000000000000000000000000000000000000000001010", + ) + + result = is_access_allowed(user_bitmask, content_bitmask) + self.assertTrue(result, "User has still access to content 1010....") + + # clear all subscription! 
+ self.userBitmap.subscriptions.clear() + self.userBitmap.refresh_from_db() + self.assertEqual( + self.userBitmap.get_bitmap_as_int(), + UserBitmap.USER_PLAN_RESEARCHER, + "User researcher subscriptions cleared. They have no more access to datasets...", + ) + self.assertFalse( + is_access_allowed( + accessor=BitMask64(self.userBitmap.bitmap), + content=BitMask64(0b100000100), + ), + "However, user has no more access to content subscription D!", + ) diff --git a/impresso/tests/test_solr.py b/impresso/tests/test_solr.py new file mode 100644 index 0000000..f6236a4 --- /dev/null +++ b/impresso/tests/test_solr.py @@ -0,0 +1,108 @@ +import unittest +from impresso.utils.solr import serialize_solr_doc_content_item_to_plain_dict +from impresso.utils.solr import mapper_doc_redact_contents +from impresso.utils.bitmask import BitMask64 + + +class SolrTestCase(unittest.TestCase): + # def test_JsonWithBitmapDecoder(self): + # # Test the JsonWithBitmapDecoder with a valid input + # original = '{"bm_get_tr_bin": 0b1010}' + # result = json.loads(original, cls=JsonWithBitmapDecoder) + + # self.assertEqual(result, {"bm_get_tr_bin": "0101"}) + def test_serialize_solr_doc_content_item_to_plain_dict(self): + # Test the function with a valid input, a document parsed from solr + result = serialize_solr_doc_content_item_to_plain_dict(FAKE_SOLR_DOC) + + self.assertEqual(result.get("_bm_get_tr_i"), 181) + self.assertEqual(result.get("title"), "Subskription.") + self.assertEqual( + result.get("content"), + "Subskription. Gebet gerne! Wer durch eine Geldspende soziales Schaffen ermöglicht,", + ) + self.assertEqual(result.get("size"), 103) + self.assertEqual(result.get("country"), "LU") + self.assertEqual(result.get("province"), "na") + self.assertEqual(result.get("periodicity"), "na") + self.assertEqual(result.get("year"), 1927) + self.assertEqual(result.get("newspaper"), "johndoe") + self.assertEqual(result.get("issue"), "johndoe-1927-11-15-a") + self.assertEqual(result.get("content_provider"), "BNL") + self.assertEqual(result.get("newspaper_topics"), "Women") + self.assertEqual( + result.get("topics"), + "tm-de-all-v2.0_tp01_de|0.02 tm-de-all-v2.0_tp03_de|0.166 tm-de-all-v2.0_tp11_de|0.026 ", + ) + + def test_mapper_doc_redact_contents(self): + doc = serialize_solr_doc_content_item_to_plain_dict( + { + "id": "johndoe-1927-11-15-a-i0009", + "content_txt_de": "Subskription. Gebet gerne! Wer durch eine Geldspende soziales Schaffen ermöglicht,", + "title_txt_de": "Subskription.", + "meta_year_i": 1927, + "bm_get_tr_i": 181, + } + ) + + # Test the function with a valid input, a document parsed from solr + result_redacted = mapper_doc_redact_contents( + doc={**doc}, + # not working user bitmask key + user_bitmask=BitMask64("0000"), + ) + self.assertEqual(result_redacted.get("content"), "[redacted]") + self.assertEqual(result_redacted.get("title"), doc.get("title")) + + result_ok = mapper_doc_redact_contents( + doc={**doc}, + # working user bitmask key + user_bitmask=BitMask64("1100"), # 0b10110101 + ) + self.assertEqual( + result_ok.get("content"), + "Subskription. Gebet gerne! Wer durch eine Geldspende soziales Schaffen ermöglicht,", + "Content is available: the user has a 1 in the right position, the content item is available", + ) + + +FAKE_SOLR_DOC = { + "id": "johndoe-1927-11-15-a-i0009", + "item_type_s": "ar", + "lg_s": "de", + "title_txt_fr": "Subskription.", + "title_txt_de": None, + "title_txt_en": None, + "content_txt_fr": None, + "content_txt_de": "Subskription. Gebet gerne! 
Wer durch eine Geldspende soziales Schaffen ermöglicht,", + "content_txt_en": None, + "content_length_i": 103, + "meta_country_code_s": "LU", + "meta_province_code_s": "na", + "meta_periodicity_s": "na", + "meta_year_i": 1927, + "meta_journal_s": "johndoe", + "meta_issue_id_s": "johndoe-1927-11-15-a", + "meta_partnerid_s": "BNL", + "meta_topics_s": "Women", + "meta_polorient_s": "na", + "olr_b": True, + "page_id_ss": ["johndoe-1927-11-15-a-p0010"], + "page_nb_is": [10], + "nb_pages_i": 1, + "front_b": False, + "meta_date_dt": "1927-11-15T00:00:00Z", + "pers_mentions": None, + "loc_mentions": None, + "access_right_s": "OpenPublic", + "score": 1.0, + "exportable_plain": None, + "topics_dpfs": [ + "tm-de-all-v2.0_tp01_de|0.02 tm-de-all-v2.0_tp03_de|0.166 tm-de-all-v2.0_tp11_de|0.026 " + ], + "ucoll_ss": None, + "bm_get_tr_s": None, + "bm_get_tr_bin": None, + "bm_get_tr_i": 181, +} diff --git a/impresso/tests/utils/__init__.py b/impresso/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/impresso/tests/utils/tasks/__init__.py b/impresso/tests/utils/tasks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/impresso/tests/utils/test_bitmask.py b/impresso/tests/utils/test_bitmask.py new file mode 100644 index 0000000..72a6d47 --- /dev/null +++ b/impresso/tests/utils/test_bitmask.py @@ -0,0 +1,50 @@ +import unittest +from impresso.utils.bitmask import BitMask64, is_access_allowed + + +class TestBitMask64(unittest.TestCase): + + def test_init_with_string(self): + self.assertEqual( + str(BitMask64("111101")), + "0000000000000000000000000000000000000000000000000000000000111101", + ) + self.assertEqual( + str(BitMask64("111101", reverse=True)), + "0000000000000000000000000000000000000000000000000000000000101111", + ) + self.assertEqual( + str(BitMask64("10", reverse=True)), + "0000000000000000000000000000000000000000000000000000000000000001", + ) + self.assertEqual( + str(BitMask64("010")), + "0000000000000000000000000000000000000000000000000000000000000010", + ) + + def test_init_with_int(self): + self.assertEqual( + str(BitMask64(0b101111)), + "0000000000000000000000000000000000000000000000000000000000101111", + ) + self.assertEqual( + str(BitMask64(10)), + "0000000000000000000000000000000000000000000000000000000000001010", + ) + + def test_is_access_allowed_edge_cases(self): + accessor_contents_expectedResult = [ + (0b101111, 10, True), + (0b101111, 0b100000000, False), + ] + for accessor, content, expected_result in accessor_contents_expectedResult: + accessor_bitmask = BitMask64(accessor) + content_bitmask = BitMask64(content) + result = is_access_allowed(accessor_bitmask, content_bitmask) + self.assertEqual( + result, + expected_result, + "accessor: {} content: {} expected_result: {} result: {}".format( + accessor, content, expected_result, result + ), + ) diff --git a/impresso/utils/bitmap.py b/impresso/utils/bitmap.py index e64d5e5..74cdf6d 100644 --- a/impresso/utils/bitmap.py +++ b/impresso/utils/bitmap.py @@ -1,38 +1,112 @@ -def check_bitmap_keys_overlap(user_bitmap_key: str, content_bitmap_key: str) -> bool: +from typing import Union + + +def is_access_allowed( + user_permissions_mask: bytes, + content_permissions_mask: bytes, +) -> bool: + """ + Checks if the user has access to the content based on the permissions masks. + + Args: + - content_permissions_mask (bytes): The content permissions mask. + - user_permissions_mask (bytes): The user permissions mask. 
+ + Returns: + - bool: Returns True if the user has access to the content, otherwise False. + + Example Usage: + >>> content_permissions_mask = b"\x01" + >>> user_permissions_mask = b"\x01" + >>> is_access_allowed(content_permissions_mask, user_permissions_mask) + True + """ + max_len = max(len(user_permissions_mask), len(content_permissions_mask)) + user_mask_padded = user_permissions_mask.rjust(max_len, b"\x00") + content_mask_padded = content_permissions_mask.rjust(max_len, b"\x00") + print(f"user_mask_padded: {user_mask_padded}") + print(f"content_mask_padded: {content_mask_padded}") + # Perform bitwise AND on each byte pair to check for overlap + for user_byte, content_byte in zip(user_mask_padded, content_mask_padded): + if user_byte & content_byte: + return True # Found an overlapping permission + + return False # No overlap found + + +def int_to_bytes(n: int) -> bytes: + """ + Converts an integer to a bytes object. + + Args: + - n (int): The integer to be converted. + + Returns: + - bytes: The bytes object. + + Example Usage: + >>> n = 1 + >>> int_to_bytes(n) + b"\x01" """ - Checks if there is any overlap between the user bitmap key and content bitmap key, basically - strings composed of 0 and 1. The keys are mirrored and only then transformed to integers to perform the - bitwise AND operation that finally check if the two keys are compatible. + return n.to_bytes((n.bit_length() + 7) // 8, "big") + + +def check_bitmap_keys_overlap( + user_bitmap_key: str, content_bitmap_key: Union[int, str] +) -> bool: + """ + Checks if there is any overlap between the user bitmap key and content bitmap key, which can be either + strings composed of 0 and 1 or integers. The keys are mirrored and only then transformed to integers to perform the + bitwise AND operation that finally checks if the two keys are compatible. Args: - - user_bitmap (str): The str representation of the user bitmap as 0 and 1 only. - - content_bitmap (str): The str representation of the content bitmap as 0 and 1 only. + - user_bitmap_key (str): The str representation of the user bitmap as 0 and 1 only. + - content_bitmap_key (str or int): The str representation of the content bitmap as 0 and 1 only, or an integer. Returns: - - int: Returns 1 if there is any overlap (i.e., if the bitwise AND result has any `1` bits), otherwise 0. + - bool: Returns True if there is any overlap (i.e., if the bitwise AND result has any `1` bits), otherwise False. Example Usage: >>> user_bitmap_key = "100100" >>> content_bitmap_key = "01000000000000" - >>> check_bitmaps_overlap(user_bitmap_key, content_bitmap_key) - 0 + >>> check_bitmap_keys_overlap(user_bitmap_key, content_bitmap_key) + False + + >>> user_bitmap_key = "100100" + >>> content_bitmap_key = 8192 # equivalent to "01000000000000" + >>> check_bitmap_keys_overlap(user_bitmap_key, content_bitmap_key) + False """ + try: + if isinstance(content_bitmap_key, int): + content_bitmap_long_int = content_bitmap_key + elif isinstance(content_bitmap_key, str): + reversed_content_bitmap_key = content_bitmap_key[::-1] + content_bitmap_long_int = int(reversed_content_bitmap_key, 2) + else: + raise ValueError( + "content_bitmap_key must be either a string of 0 and 1 or an integer" + ) + except ValueError as e: + print( + f"content_bitmap_key must be either a string of 0 and 1 or an integer. 
Received: {content_bitmap_key}" + ) + print(f"Error: {e}") + return False try: reversed_user_bitmap_key = user_bitmap_key[::-1] - reversed_content_bitmap_key = content_bitmap_key[::-1] - # transform to int - user_bitmap = int(reversed_user_bitmap_key, 2) - content_bitmap = int(reversed_content_bitmap_key, 2) - - # print(f"user reversed original:\n {user_bitmap:05b}") - # print(f"content reversed original:\n {content_bitmap:05b}") - # Perform the bitwise AND to check if there's any overlap - result = user_bitmap & content_bitmap + user_bitmap_long_int = int(reversed_user_bitmap_key, 2) + except ValueError as e: print( - f"user_bitmap_key and content_bitmap_key must be strings of 0 and 1 only, received: user_bitmap_key={user_bitmap_key} and content_bitmap_key={content_bitmap_key}" + f"user_bitmap_key and content_bitmap_key must be strings of 0 and 1 only, or content_bitmap_key can be an integer. Received: user_bitmap_key={user_bitmap_key} and content_bitmap_key={content_bitmap_key}" ) print(f"Error: {e}") return False - # print(f"result:\n {result:05b}") + # Perform the bitwise AND to check if there's any overlap + # print(f"user_bitmap_long_int: {bin(user_bitmap_long_int)}") + # print(f"content_bitmap_long_int: {bin(content_bitmap_long_int)}") + result = user_bitmap_long_int & content_bitmap_long_int + # print(f"result: {bin(result)} {result > 0}") return result > 0 diff --git a/impresso/utils/bitmask.py b/impresso/utils/bitmask.py new file mode 100644 index 0000000..033e452 --- /dev/null +++ b/impresso/utils/bitmask.py @@ -0,0 +1,59 @@ +class BitMask64: + def __init__(self, value: str | int | bytes = 0, reverse: bool = False): + if isinstance(value, str): + if not all(c in "01" for c in value): + raise ValueError("String must contain only '0' and '1'") + if len(value) > 64: + raise ValueError("String must contain maximum 64 characters") + self._value = int(f"{value:064}"[::-1], 2) if reverse else int(value, 2) + elif isinstance(value, int): + self._value = int(f"{value:064b}", 2) if reverse else value + elif isinstance(value, bytes): + if len(value) > 8: + raise ValueError("Bytes must contain maximum 8 bytes") + self._value = int.from_bytes(value, byteorder="big") + else: + raise TypeError( + "Value must be a string of bits or an integer. Type:", type(value) + ) + # Ensure the value is within the 64-bit range and pad if necessary + # self._value &= 0xFFFFFFFFFFFFFFFF + + def __int__(self): + return self._value + + def __str__(self): + return bin(self._value)[2:].zfill(64) + + +def is_access_allowed(accessor: BitMask64, content: BitMask64) -> bool: + """ + Check if access is allowed based on the provided bit masks. + + This function takes two BitMask64 objects, `accessor` and `content`, and + performs a bitwise AND operation to determine if access is allowed. If the + result of the bitwise AND operation is greater than 0, access is allowed. + + Args: + accessor (BitMask64): The bit mask representing the accessor's permissions. + content (BitMask64): The bit mask representing the content's required permissions. + If an integer is provided, it will be reversed. + + Returns: + bool: True if access is allowed, False otherwise. + """ + result = int(accessor) & int(content) + return result > 0 + + +def int_to_bytes(n: int) -> bytes: + """ + Convert an integer to a bytes object. + + Args: + n (int): The integer to convert to bytes. + + Returns: + bytes: The bytes object representing the integer. 
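+
+    Example Usage:
+        >>> int_to_bytes(181)  # 0b10110101
+        b'\xb5'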
+ """ + return n.to_bytes((n.bit_length() + 7) // 8, "big") diff --git a/impresso/utils/solr.py b/impresso/utils/solr.py new file mode 100644 index 0000000..f189c7b --- /dev/null +++ b/impresso/utils/solr.py @@ -0,0 +1,107 @@ +from typing import Dict, Any +from django.conf import settings +from .bitmask import is_access_allowed, BitMask64 + + +def serialize_solr_doc_content_item_to_plain_dict( + doc: Dict[str, Any], + field_mapping: Dict[str, str] = settings.IMPRESSO_SOLR_FIELDS_TO_ARTICLE_PROPS, +) -> Dict[str, str]: + """ + Convert a Solr document to a content item object as a dictionary. + + Args: + doc: Solr document + field_mapping: Mapping between Solr fields and content item properties + + Returns: + dict: Content item object + """ + result: Dict[str, str] = {} + + for k, v in doc.items(): + prop = field_mapping.get(k, None) + if prop is None: + continue + if isinstance(v, list): + result[prop] = ",".join(str(x) for x in v) + elif not result.get(prop, ""): + result[prop] = v + + return result + + +def mapper_doc_redact_contents(doc: dict, user_bitmask: BitMask64) -> dict: + """ + Redacts the content of a document based on its bitmap key (_bm_get_tr_s) + or its availability and year. + + This function modifies the input document dictionary by redacting its content + if certain conditions are met. Specifically, it checks the "is_content_available" + field and the document's year to determine if the content should be redacted. + + Args: + doc (dict): A dictionary representing the document obtained via the serializer function . + to be considered valid, tt must contain the key "year". + user_bitmask (BitMask64): The user's bitmap key, as BitMask64 instance. + + Returns: + dict: The modified document dictionary with redacted content if applicable. + + Notes: + - If the document's year is greater than or equal to the maximum allowed year + defined in settings.IMPRESSO_CONTENT_DOWNLOAD_MAX_YEAR, the content is redacted. + """ + try: + doc_year = int(doc["year"]) + except KeyError: + print(doc) + raise ValueError("Document does not contain a 'year' field.") + + is_transcript_available = False + + if doc.get("_bm_get_tr_i", None) is not None: + is_transcript_available = is_access_allowed( + accessor=user_bitmask, + content=BitMask64(doc["_bm_get_tr_i"], reverse=True), + ) + elif doc.get("_bm_get_tr_s", None) is not None: + is_transcript_available = is_access_allowed( + accessor=user_bitmask, + # nop need to reverse if this is a string + content=BitMask64(doc["_bm_get_tr_s"]), + ) + elif doc.get("access_right", "") == "OpenPublic": + is_transcript_available = True + # edge cases + elif doc_year < settings.IMPRESSO_CONTENT_DOWNLOAD_MAX_YEAR: + is_transcript_available = True + doc["content"] = "[redacted]" + doc["is_content_available"] = "N" + # doc["is_content_available_notes"] = "year restricted" + if is_transcript_available: + doc["is_content_available"] = "Y" + else: + doc["content"] = "[redacted]" + doc["excerpt"] = "[redacted]" + doc["is_content_available"] = "N" + + return doc + + +def mapper_doc_remove_private_collections(doc: dict, prefix: str) -> dict: + """ + Removes the private collections from the document that do not start with the job creator's ID. + + Args: + doc (dict): The document dictionary containing collections. + prefix (str): The prefix of the collections to keep, actually containing the creator's profile information. + + Returns: + dict: The updated document dictionary with filtered collections. 
+ """ + if "collections" in doc: + # remove collection from the doc if they do not start wirh job creator id + collections = [d for d in doc["collections"].split(",") if d.startswith(prefix)] + doc["collections"] = ",".join(collections) + return doc diff --git a/impresso/utils/tasks/__init__.py b/impresso/utils/tasks/__init__.py index e344926..5201d3b 100644 --- a/impresso/utils/tasks/__init__.py +++ b/impresso/utils/tasks/__init__.py @@ -3,7 +3,7 @@ from typing import Tuple, Any, Dict, Optional from django.conf import settings from ...models import Job -from ..bitmap import check_bitmap_keys_overlap + TASKSTATE_INIT = "INIT" TASKSTATE_PROGRESS = "PROGRESS" @@ -60,7 +60,8 @@ def update_job_progress( logger: Optional[Any] = None, ) -> None: """ - Generic function to update a job. + Generic function to update a job that also specify the `task` message + autoatically get the Impresso Middle Layer. Args: task (Any): The task object. @@ -71,16 +72,50 @@ def update_job_progress( message (str, optional): A message to log. Defaults to "". logger (Optional[Any], optional): Logger instance for logging. Defaults to None. """ - meta = job.get_task_meta(taskname=task.name, progress=progress, extra=extra) + # this is the JSON message that will be stored in REDIS (celery) and + # get from src/selery.ts module in Impresso Middle Layer. + # among the extra: `collection:Dict` and `query:str`. + try: + job_current_extra = json.loads(job.extra) + except json.JSONDecodeError: + job_current_extra = {} + except TypeError: + job_current_extra = {} + # add or update basic task info + job_current_extra.update( + { + "channel": job.creator.profile.uid, + "taskname": task.name, + "taskstate": taskstate, + "progress": progress, + "message": message, + }, + **extra, + ) + # update the job extra field, it is an old TextField + job.extra = json.dumps(job_current_extra) if logger: logger.info( f"[job:{job.pk} user:{job.creator.pk}] " f"type={job.type} status={job.status} taskstate={taskstate} " f"progress={progress * 100:.2f}% - message: '{message}'" ) - job.extra = json.dumps(meta) job.save() - task.update_state(state=taskstate, meta=meta) + task.update_state( + state=taskstate, + meta={ + "job": { + "id": job.pk, + "type": job.type, + "status": job.status, + "date_created": job.date_created.isoformat(), + "date_last_modified": job.date_last_modified.isoformat(), + "creator": job.creator.id, + "description": job.description, + }, + **job_current_extra, + }, + ) def update_job_completed( @@ -141,69 +176,3 @@ def is_task_stopped( extra=extra, ) return True - - -def mapper_doc_redact_contents(doc: dict, user_bitmap_key: str) -> dict: - """ - Redacts the content of a document based on its bitmap key (_bitmap_get_tr) - or its availability and year. - - This function modifies the input document dictionary by redacting its content - if certain conditions are met. Specifically, it checks the "is_content_available" - field and the document's year to determine if the content should be redacted. - - Args: - doc (dict): A dictionary representing the document. It must contain the keys - "year" and optionally "is_content_available". - - Returns: - dict: The modified document dictionary with redacted content if applicable. - - Notes: - - If "is_content_available" is present and not equal to "true", the content - is redacted and "is_content_available" is set to an empty string. - - If "is_content_available" is "true", it is changed to "y". 
- - If the document's year is greater than or equal to the maximum allowed year - defined in settings.IMPRESSO_CONTENT_DOWNLOAD_MAX_YEAR, the content is redacted. - """ - try: - doc_year = int(doc["year"]) - except KeyError: - print(doc) - raise ValueError("Document does not contain a 'year' field.") - - if doc.get("_bitmap_get_tr", None) is not None: - if not check_bitmap_keys_overlap(user_bitmap_key, doc["_bitmap_get_tr"]): - doc["content"] = "[redacted]" - doc["excerpt"] = "[redacted]" - doc["is_content_available"] = "N" - else: - doc["is_content_available"] = "y" - elif "is_content_available" in doc: - if doc["is_content_available"] != "true": - doc["content"] = "[redacted]" - doc["is_content_available"] = "N" - else: - doc["is_content_available"] = "y" - elif doc_year >= settings.IMPRESSO_CONTENT_DOWNLOAD_MAX_YEAR: - doc["content"] = "[redacted]" - doc["is_content_available"] = "N" - return doc - - -def mapper_doc_remove_private_collections(doc: dict, prefix: str) -> dict: - """ - Removes the private collections from the document that do not start with the job creator's ID. - - Args: - doc (dict): The document dictionary containing collections. - prefix (str): The prefix of the collections to keep, actually containing the creator's profile information. - - Returns: - dict: The updated document dictionary with filtered collections. - """ - if "collections" in doc: - # remove collection from the doc if they do not start wirh job creator id - collections = [d for d in doc["collections"].split(",") if d.startswith(prefix)] - doc["collections"] = ",".join(collections) - return doc diff --git a/impresso/utils/tasks/export.py b/impresso/utils/tasks/export.py index 258c680..30dfc76 100644 --- a/impresso/utils/tasks/export.py +++ b/impresso/utils/tasks/export.py @@ -6,14 +6,15 @@ from typing import Tuple from zipfile import ZipFile, ZIP_DEFLATED from ...models import Job -from ...solr import find_all, solr_doc_to_content_item -from ...utils.tasks import ( - get_pagination, +from ...solr import find_all +from ...utils.tasks import get_pagination +from ...utils.bitmask import BitMask64 +from ...utils.solr import ( mapper_doc_remove_private_collections, mapper_doc_redact_contents, + serialize_solr_doc_content_item_to_plain_dict, ) - default_logger = logging.getLogger(__name__) @@ -44,7 +45,8 @@ def helper_export_query_as_csv_progress( job: Job, query: str, query_hash: str, - user_bitmap_key: str, + user_bitmap_key: int, + ignore_fields: list = [], skip: int = 0, limit: int = 100, logger: logging.Logger = default_logger, @@ -58,6 +60,8 @@ def helper_export_query_as_csv_progress( Args: job (Job): The job object containing user profile information. query (str): The SOLR query string. + query_hash (str): The hash of the query string. + user_bitmap_key (int): The user's bitmap key. skip (int, optional): The number of items to skip. Defaults to 0. limit (int, optional): The maximum number of items per page. Defaults to 0. logger (Any, optional): The logger object. Defaults to None. @@ -67,9 +71,11 @@ def helper_export_query_as_csv_progress( - loops (int): The number of loops allowed. - progress (float): The progress percentage. 
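+
+    Note:
+        Fields listed in `ignore_fields` are excluded both from the `fl` parameter
+        of the Solr request and from the columns written to the CSV file.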
""" - contents = find_all( - q=query, fl=settings.IMPRESSO_SOLR_FIELDS, skip=skip, logger=logger - ) + # remove fields to speed up the process + query_param_fl = [ + field for field in settings.IMPRESSO_SOLR_FIELDS if field not in ignore_fields + ] + contents = find_all(q=query, fl=query_param_fl, skip=skip, logger=logger) total = contents["response"]["numFound"] qtime = contents["responseHeader"]["QTime"] # generate extra from job stats @@ -90,7 +96,7 @@ def helper_export_query_as_csv_progress( loops, progress, ) - + user_bitmask = BitMask64(user_bitmap_key) logger.info( f"[job:{job.pk} user:{job.creator.pk}] Opening file in APPEND mode:" f"{job.attachment.upload.path}" @@ -100,7 +106,7 @@ def helper_export_query_as_csv_progress( fieldnames = [ field for field in settings.IMPRESSO_SOLR_ARTICLE_PROPS - if not field.startswith("_") + if not field.startswith("_") and field not in ignore_fields ] # Sort fieldnames with 'uid' first, then the rest alphabetically with open( @@ -117,7 +123,6 @@ def helper_export_query_as_csv_progress( logger.info( f"[job:{job.pk} user:{job.creator.pk}] writing header: {fieldnames}" ) - w.writeheader() # write custom header w.writerow({fieldnames[0]: get_results_message(total, max_loops, limit)}) w.writerow( @@ -134,6 +139,7 @@ def helper_export_query_as_csv_progress( ) # empty line w.writerow({}) + w.writeheader() # filter out docs without proper metadata. We will warn about them in a moment rows = [ @@ -149,13 +155,13 @@ def helper_export_query_as_csv_progress( ) for row in rows: - content_item = solr_doc_to_content_item(row) + content_item = serialize_solr_doc_content_item_to_plain_dict(row) content_item = mapper_doc_remove_private_collections( doc=content_item, prefix=job.creator.profile.uid ) content_item = mapper_doc_redact_contents( doc=content_item, - user_bitmap_key=user_bitmap_key, + user_bitmask=user_bitmask, ) # removed unwanted fields from the content_item content_item = {k: v for k, v in content_item.items() if k in fieldnames}