From 19b8297efb9ba496fcf32ca54fd52f4c3f471f83 Mon Sep 17 00:00:00 2001 From: Mateus Vieira <68292695+mateusvrs@users.noreply.github.com> Date: Fri, 29 Dec 2023 15:21:38 -0300 Subject: [PATCH] =?UTF-8?q?task(api):=20otimiza=20piores=20e=20m=C3=A9dios?= =?UTF-8?q?=20casos=20da=20atualiza=C3=A7=C3=A3o=20do=20db=20por=20meio=20?= =?UTF-8?q?de=20redis=20cache,=20pages=20fingerprints=20e=20multithreading?= =?UTF-8?q?=20(#183)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * devops(python): django-redis lib and to container * core(settings): config redis cache with env vars * utils(scraper): add web page fingerprinter * utils(commands): separate mult replace into func * api(commands): add cache verification to update db * utils(commands): move mult replace from file * utils(tests): check cache access of finge * api(commands): add multithreading to db update * typo(commands): change 'bando' to 'banco' * core(settings): set up global time constants * api(decorators): wrapper to del cache keys * api(commands): move cache set to take effect * api(models): add cache handle to unique delete * api(tests): add check to models delete --- api/.env.example | 3 + api/api/decorators.py | 21 +++ api/api/management/commands/updatedb.py | 123 ++++++++++++++---- api/api/models.py | 58 ++++++++- ...st_discipline_models.py => test_models.py} | 38 +++++- api/core/settings/base.py | 24 ++++ api/requirements.txt | 2 + api/utils/db_handler.py | 12 +- api/utils/functions.py | 14 ++ api/utils/management/commands/updatemock.py | 13 +- api/utils/tests/test_web_scraping.py | 53 +++++++- api/utils/web_scraping.py | 49 ++++--- docker-compose.yml | 9 +- 13 files changed, 351 insertions(+), 68 deletions(-) create mode 100644 api/api/decorators.py rename api/api/tests/{test_discipline_models.py => test_models.py} (67%) create mode 100644 api/utils/functions.py diff --git a/api/.env.example b/api/.env.example index 430ea204..86399b4c 100644 --- a/api/.env.example +++ b/api/.env.example @@ -17,6 +17,9 @@ POSTGRES_DB="postgres" POSTGRES_USER="suagradeunb" POSTGRES_PASSWORD="suagradeunb" +# Redis +REDIS_CACHE_LOCATION="redis://redis:6379/1" + # Credenciais de acesso ao admin ADMIN_NAME="admin" ADMIN_PASS="admin" diff --git a/api/api/decorators.py b/api/api/decorators.py new file mode 100644 index 00000000..2206f95e --- /dev/null +++ b/api/api/decorators.py @@ -0,0 +1,21 @@ +from django.core.cache import cache +from api.models import cache_error_msg +import functools + + +def handle_cache_before_delete(query_func: callable) -> callable: + + @functools.wraps(query_func) + def wrapper(*args, **kwargs): + queryset = query_func(*args, **kwargs) + + try: + for query in queryset: + cache_key = query.get_cache_key() + cache.delete(cache_key) + except: # pragma: no cover + raise ValueError(cache_error_msg) + else: + queryset.delete() + + return wrapper diff --git a/api/api/management/commands/updatedb.py b/api/api/management/commands/updatedb.py index 72368c82..5a5377fc 100644 --- a/api/api/management/commands/updatedb.py +++ b/api/api/management/commands/updatedb.py @@ -1,9 +1,15 @@ from typing import Any from argparse import ArgumentParser as CommandParser from django.core.management.base import BaseCommand -from utils import sessions, web_scraping -from utils.db_handler import delete_classes_from_discipline, delete_all_departments_using_year_and_period, get_or_create_department, get_or_create_discipline, create_class -from time import time +from django.db import transaction +from utils import sessions +from utils import db_handler as dbh +from utils.web_scraping import DisciplineWebScraper, get_list_of_departments +from django.core.cache import cache +from time import time, sleep +from collections import deque +from core.settings.base import THIRTY_DAYS_IN_SECS +import threading class Command(BaseCommand): @@ -13,9 +19,12 @@ class Command(BaseCommand): def add_arguments(self, parser: CommandParser) -> None: """Adiciona os argumentos do comando.""" - parser.add_argument('-a', '-all', action='store_true', dest='all', default=False, + parser.add_argument('-a', '--all', action='store_true', dest='all', default=False, help="Atualiza o banco de dados com as disciplinas dos períodos atual e seguinte.") + parser.add_argument('-ds', '--descriptive', action='store_true', default=False, + help="Ativa a opção de uma atualização descritiva com os outputs (print) necessários") + parser.add_argument('-p', '--period', action='store', default=None, choices=[".".join(sessions.get_current_year_and_period()), ".".join( sessions.get_next_period())], @@ -26,6 +35,7 @@ def add_arguments(self, parser: CommandParser) -> None: def handle(self, *args: Any, **options: Any): choices = [] + threads = [] if options["all"]: choices.append(sessions.get_current_year_and_period()) @@ -37,19 +47,27 @@ def handle(self, *args: Any, **options: Any): print("Utilize o comando 'updatedb -h' para mais informações.") return - # Obtem o ano e o período anterior ao período atual + # Obtém o ano e o período anterior ao período atual previous_period_year, previous_period = sessions.get_previous_period() # Apaga as disciplinas do período anterior - delete_all_departments_using_year_and_period( + dbh.delete_all_departments_using_year_and_period( year=previous_period_year, period=previous_period) if options["delete"]: for year, period in choices: - self.delete_period(year=year, period=period) + thread = threading.Thread( + target=self.delete_period, args=(year, period,)) + threads.append(thread) + thread.start() + + for thread in threads: + thread.join() + threads.clear() + return - departments_ids = web_scraping.get_list_of_departments() + departments_ids = get_list_of_departments() if departments_ids is None: self.display_error_message("department_ids") @@ -57,47 +75,102 @@ def handle(self, *args: Any, **options: Any): print("Atualizando o banco de dados...") - for year, period in choices: + def start_update_year_period(year: str, period: str): try: start_time = time() - self.update_departments( - departments_ids=departments_ids, year=year, period=period) + print(f"Começando atualização de {year}/{period}") + with transaction.atomic(): + self.update_departments( + departments_ids, year, period, options) + self.display_success_update_message( operation=f"{year}/{period}", start_time=start_time) except Exception as exception: - print("Houve um erro na atualização do bando de dados.") + print("Houve um erro na atualização do banco de dados.") print(f"Error: {exception}") - def update_departments(self, departments_ids: list, year: str, period: str) -> None: + start_tot_time = time() + for year, period in choices: + thread = threading.Thread( + target=start_update_year_period, args=(year, period,)) + threads.append(thread) + thread.start() + sleep(0.01) # little time to start print don't overleap + + print() + + for thread in threads: + thread.join() + threads.clear() + + print(f"\nTempo total de execução: {(time() - start_tot_time):.1f}s") + + def update_departments(self, departments_ids: list, year: str, period: str, options: Any) -> None: """Atualiza os departamentos do banco de dados e suas respectivas disciplinas.""" - for department_id in departments_ids: - print(f"WebScraping do departamento: {department_id}") - disciplines_list = web_scraping.get_department_disciplines( - department_id=department_id, current_year=year, current_period=period) - department = get_or_create_department( + def execute_update(department_id): + scraper = DisciplineWebScraper(department_id, year, period) + fingerprint = scraper.create_page_fingerprint() + + cache_key = f"{department_id}/{year}.{period}" + try: + cache_value = cache.get(cache_key) + if cache_value and cache_value == fingerprint: + if options['descriptive']: + print(f"Departamento ({department_id}) atualizado, operação não necessária") + return + except: + print("Ocorreu um erro ao tentar acessar o cache") + pass + + disciplines_list = scraper.get_disciplines() + department = dbh.get_or_create_department( code=department_id, year=year, period=period) - print("Atualizando disciplinas do departamento...") + if options['descriptive']: + print(f"Departamento ({department_id}) desatualizado, operação necessária") + # Para cada disciplina do período atual, deleta as turmas previamente cadastradas e cadastra novas turmas no banco de dados for discipline_code in disciplines_list: classes_info = disciplines_list[discipline_code] # Cria ou pega a disciplina - discipline = get_or_create_discipline( + discipline = dbh.get_or_create_discipline( name=classes_info[0]["name"], code=discipline_code, department=department) # Deleta as turmas previamente cadastradas - delete_classes_from_discipline(discipline=discipline) + dbh.delete_classes_from_discipline(discipline=discipline) # Cadastra as novas turmas for class_info in classes_info: - create_class(teachers=class_info["teachers"], - classroom=class_info["classroom"], schedule=class_info["schedule"], - days=class_info["days"], _class=class_info["class_code"], discipline=discipline, special_dates=class_info["special_dates"]) + dbh.create_class(teachers=class_info["teachers"], + classroom=class_info["classroom"], schedule=class_info["schedule"], + days=class_info["days"], _class=class_info["class_code"], discipline=discipline, special_dates=class_info["special_dates"]) + + cache.set(cache_key, fingerprint, timeout=THIRTY_DAYS_IN_SECS) + + if options['descriptive']: + print(f'Operação de atualização finalizada para o departamento ({department_id})') + + threads = deque() + for department_id in departments_ids: + thread = threading.Thread( + target=execute_update, args=(department_id,)) + threads.append(thread) + thread.start() + + if len(threads) == 3: + threads[0].join() + threads.popleft() + + for thread in threads: + thread.join() + threads.clear() def delete_period(self, year: str, period: str) -> None: """Deleta um período do banco de dados.""" start_time = time() - delete_all_departments_using_year_and_period(year=year, period=period) + with transaction.atomic(): + dbh.delete_all_departments_using_year_and_period( + year=year, period=period) self.display_success_delete_message( operation=f"{year}/{period}", start_time=start_time) diff --git a/api/api/models.py b/api/api/models.py index 4a891cb6..e3cf8235 100644 --- a/api/api/models.py +++ b/api/api/models.py @@ -3,8 +3,27 @@ from django.contrib.postgres.fields import ArrayField from users.models import User from django.utils import timezone +from django.core.cache import cache -class Department(models.Model): +cache_error_msg = "Cache isn't working properly, so database isn't allowed to be modified!" + + +class CustomModel(models.Model): + class Meta: + abstract = True + + def delete(self, *args, **kwargs): + try: + cache.delete(kwargs['cache_key']) + kwargs.pop('cache_key') + except: # pragma: no cover + raise ValueError(cache_error_msg) + else: + super(CustomModel, self).delete() + pass + + +class Department(CustomModel): """Classe que representa um departamento. code:str -> Código do departamento year:str -> Ano do departamento @@ -17,8 +36,19 @@ class Department(models.Model): def __str__(self): return self.code + def get_cache_key(self): + code = self.code + year = self.year + period = self.period + + return f"{code}/{year}.{period}" + + def delete(self, *args, **kwargs): + kwargs['cache_key'] = self.get_cache_key() + super(Department, self).delete(*args, **kwargs) -class Discipline(models.Model): + +class Discipline(CustomModel): """Classe que representa uma disciplina. name:str -> Nome da disciplina unicode_name:str -> Nome da disciplina normalizado @@ -38,8 +68,19 @@ def save(self, *args, **kwargs): self.unicode_name = unidecode(self.name).casefold() super(Discipline, self).save(*args, **kwargs) + def get_cache_key(self): + code = self.department.code + year = self.department.year + period = self.department.period + + return f"{code}/{year}.{period}" + + def delete(self, *args, **kwargs): + kwargs['cache_key'] = self.get_cache_key() + super(Discipline, self).delete(*args, **kwargs) + -class Class(models.Model): +class Class(CustomModel): """Classe que representa uma turma. teachers:list -> Lista de professores da turma classroom:str -> Sala da turma @@ -67,6 +108,17 @@ class Class(models.Model): def __str__(self): return self._class + def get_cache_key(self): + code = self.discipline.department.code + year = self.discipline.department.year + period = self.discipline.department.period + + return f"{code}/{year}.{period}" + + def delete(self, *args, **kwargs): + kwargs['cache_key'] = self.get_cache_key() + super(Class, self).delete(*args, **kwargs) + class Schedule(models.Model): """Classe que representa um horário. diff --git a/api/api/tests/test_discipline_models.py b/api/api/tests/test_models.py similarity index 67% rename from api/api/tests/test_discipline_models.py rename to api/api/tests/test_models.py index c59a7e23..408c9fce 100644 --- a/api/api/tests/test_discipline_models.py +++ b/api/api/tests/test_models.py @@ -1,9 +1,10 @@ from django.test import TestCase +from django.core.cache import cache from api.models import Department, Discipline, Class -class DisciplineModelsTest(TestCase): - def setUp(self): +class ModelsTest(TestCase): + def create_data(self): self.department = Department.objects.create( code='INF', year="2023", @@ -24,6 +25,11 @@ def setUp(self): discipline=self.discipline ) + cache.set("INF/2023.2", "hash_value") + + def setUp(self): + self.create_data() + def test_create_discipline(self): self.assertEqual(self.discipline.name, 'Métodos de Desenvolvimento de Software') @@ -52,3 +58,31 @@ def test_str_method_of_class(self): def test_str_method_of_department(self): self.assertEqual(str(self.department), self.department.code) + + def test_delete_department_with_cache_handle(self): + self.department.delete() + + empty_model = not len(Department.objects.all()) + empty_cache = not len(cache.keys('*')) + + self.assertTrue(empty_model) + self.assertTrue(empty_cache) + + def test_delete_discipline_with_cache_handle(self): + self.discipline.delete() + + empty_model = not len(Discipline.objects.all()) + empty_cache = not len(cache.keys('*')) + + self.assertTrue(empty_model) + self.assertTrue(empty_cache) + + def test_delete_class_with_cache_handle(self): + self._class.delete() + + empty_model = not len(Class.objects.all()) + empty_cache = not len(cache.keys('*')) + + self.assertTrue(empty_model) + self.assertTrue(empty_cache) + \ No newline at end of file diff --git a/api/core/settings/base.py b/api/core/settings/base.py index 6472e170..282ec66a 100644 --- a/api/core/settings/base.py +++ b/api/core/settings/base.py @@ -47,6 +47,30 @@ 'rest_framework_simplejwt.token_blacklist' ] +# Time constants + +HOUR_IN_SECS = 60 * 60 +HALF_DAY_IN_SECS = 12 * HOUR_IN_SECS +DAY_IN_SECS = 24 * HOUR_IN_SECS +THIRTY_DAYS_IN_SECS = 30 * DAY_IN_SECS + +# Cache +# https://docs.djangoproject.com/en/5.0/topics/cache/ + +CACHES = { + "default": { + "BACKEND": "django_redis.cache.RedisCache", + "LOCATION": config("REDIS_CACHE_LOCATION"), + "OPTIONS": { + "CLIENT_CLASS": "django_redis.client.DefaultClient", + }, + "TIMEOUT": HALF_DAY_IN_SECS + } +} + +SESSION_ENGINE = "django.contrib.sessions.backends.cache" +SESSION_CACHE_ALIAS = "default" + # Authentication AUTH_USER_MODEL = 'users.User' diff --git a/api/requirements.txt b/api/requirements.txt index 93e83cf5..43e557a6 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -15,6 +15,7 @@ defusedxml==0.7.1 dj-database-url==2.1.0 Django==4.2.7 django-cors-headers==4.3.0 +django-redis==5.4.0 djangorestframework==3.14.0 djangorestframework-simplejwt==5.3.0 drf-yasg==1.21.7 @@ -51,6 +52,7 @@ python3-openid==3.2.0 pytz==2023.3.post1 PyYAML==6.0.1 pyyaml_env_tag==0.1 +redis==5.0.1 regex==2023.8.8 requests==2.31.0 rsa==4.9 diff --git a/api/utils/db_handler.py b/api/utils/db_handler.py index 36ffde03..08e4cb7e 100644 --- a/api/utils/db_handler.py +++ b/api/utils/db_handler.py @@ -1,6 +1,7 @@ from api.models import Discipline, Department, Class from api.serializers import ClassSerializerSchedule from api.models import Schedule +from api.decorators import handle_cache_before_delete from users.models import User @@ -30,15 +31,16 @@ def create_class(teachers: list, classroom: str, schedule: str, return Class.objects.create(teachers=teachers, classroom=classroom, schedule=schedule, days=days, _class=_class, special_dates=special_dates, discipline=discipline) - -def delete_classes_from_discipline(discipline: Discipline) -> None: +@handle_cache_before_delete +def delete_classes_from_discipline(discipline: Discipline) -> QuerySet: """Deleta todas as turmas de uma disciplina.""" - Class.objects.filter(discipline=discipline).delete() + return Class.objects.filter(discipline=discipline) -def delete_all_departments_using_year_and_period(year: str, period: str) -> None: +@handle_cache_before_delete +def delete_all_departments_using_year_and_period(year: str, period: str) -> QuerySet: """Deleta um departamento de um periodo especifico.""" - Department.objects.filter(year=year, period=period).delete() + return Department.objects.filter(year=year, period=period) def get_best_similarities_by_name(name: str, disciplines: Discipline = Discipline.objects, config="portuguese_unaccent") -> QuerySet: diff --git a/api/utils/functions.py b/api/utils/functions.py new file mode 100644 index 00000000..5f53e45a --- /dev/null +++ b/api/utils/functions.py @@ -0,0 +1,14 @@ +import re + + +def multiple_replace(text, replacement=None): + replacement_dict = replacement + if not replacement: # pragma: no cover + replacement_dict = { + '\n': '', + '\t': '', + '\r': '', + } + + pattern = re.compile('|'.join(map(re.escape, replacement_dict.keys()))) + return pattern.sub(lambda match: replacement_dict[match.group(0)], text) diff --git a/api/utils/management/commands/updatemock.py b/api/utils/management/commands/updatemock.py index 7049b2c8..04db043c 100644 --- a/api/utils/management/commands/updatemock.py +++ b/api/utils/management/commands/updatemock.py @@ -4,7 +4,7 @@ from utils import sessions as sns, web_scraping as wbp from django.core.management.base import BaseCommand from pathlib import Path -import re +from utils.functions import multiple_replace import json import os @@ -33,7 +33,7 @@ def handle(self, *args: Any, **options: Any): department, current_year, current_period) response = discipline_scraper.get_response_from_disciplines_post_request() - striped_response = self.multiple_replace( + striped_response = multiple_replace( self.response_decode(response)) mock_file.write(striped_response) @@ -49,15 +49,6 @@ def handle(self, *args: Any, **options: Any): print('Não foi possível atualizar o mock!') print('Error:', error) - def multiple_replace(self, text): - replacement_dict = { - '\n': '', - '\t': '', - '\r': '', - } - pattern = re.compile('|'.join(map(re.escape, replacement_dict.keys()))) - return pattern.sub(lambda match: replacement_dict[match.group(0)], text) - def response_decode(self, response: Response) -> str: encoding = response.encoding if response.encoding else 'utf-8' return response.content.decode(encoding) diff --git a/api/utils/tests/test_web_scraping.py b/api/utils/tests/test_web_scraping.py index 00edeb85..c9cf46bb 100644 --- a/api/utils/tests/test_web_scraping.py +++ b/api/utils/tests/test_web_scraping.py @@ -4,11 +4,18 @@ from utils import web_scraping as wbp from django.urls import reverse from pathlib import Path +from django.core.cache import cache import random import json + class WebScrapingTest(APITestCase): + def setUp(self): + # Clean cache + for key in cache.keys("*"): + cache.delete(key) + def cookie(self): cookie = "" for _ in range(32): @@ -16,7 +23,7 @@ def cookie(self): return cookie - def make_disciplines_request(self, path_name: str): + def generate_args(self, path_name: str): current_path = Path(__file__).parents[1].absolute() infos_path = current_path / f"mock/infos.json" @@ -29,12 +36,27 @@ def make_disciplines_request(self, path_name: str): url = reverse(f'utils:sigaa', kwargs={"path": path_name}) args = [department, year, period, url, self.client, self.cookie()] - disciplines = wbp.get_department_disciplines(*args) + + return args + + def make_disciplines_request(self, path_name: str): + args = self.generate_args(path_name) + + scraper = wbp.DisciplineWebScraper(*args) + disciplines = scraper.get_disciplines() return disciplines + def create_fingerprint(self, path_name: str): + args = self.generate_args(path_name) + + scraper = wbp.DisciplineWebScraper(*args) + + return scraper.create_page_fingerprint() + def test_get_list_of_departments(self): - response = self.client.get(reverse('utils:sigaa', kwargs={"path": "sigaa"})) + response = self.client.get( + reverse('utils:sigaa', kwargs={"path": "sigaa"})) departments = wbp.get_list_of_departments(response) self.assertEqual(type(list()), type(departments)) @@ -42,7 +64,8 @@ def test_get_list_of_departments(self): self.assertEqual(type(str()), type(departments[0])) def test_get_list_of_departments_when_empty(self): - response = self.client.get(reverse('utils:sigaa', kwargs={"path": "empty"})) + response = self.client.get( + reverse('utils:sigaa', kwargs={"path": "empty"})) departments = wbp.get_list_of_departments(response) self.assertIsNone(departments) @@ -70,4 +93,24 @@ def test_get_department_disciplines_when_without_tr_html_tag(self): disciplines = self.make_disciplines_request('table') self.assertFalse(len(disciplines)) - \ No newline at end of file + + def test_do_not_find_nonexisting_fingerprint(self): + cache_value = cache.get('0000/2023.1') + + self.assertEqual(cache_value, None) + + def test_find_existing_fingerprint(self): + fingerprint = self.create_fingerprint('sigaa') + + key = '0000/2023.1' + cache.set(key, fingerprint) + + self.assertEqual(cache.get(key), fingerprint) + + def test_find_existing_fingerprint_from_empty(self): + fingerprint = self.create_fingerprint('empty') + + key = '0001/2023.1' + cache.set(key, fingerprint) + + self.assertEqual(cache.get(key), 'not_content') diff --git a/api/utils/web_scraping.py b/api/utils/web_scraping.py index 3e5cb255..e6db21e1 100644 --- a/api/utils/web_scraping.py +++ b/api/utils/web_scraping.py @@ -3,8 +3,10 @@ from collections import defaultdict from typing import List, Optional, Iterator from re import findall, finditer +from utils.functions import multiple_replace import requests.utils import requests +import hashlib ''' Modo de uso: @@ -46,15 +48,6 @@ def get_list_of_departments(response=get_response(create_request_session())) -> return department_ids -def get_department_disciplines(department_id: str, current_year: str, current_period: str, url=URL, session=None, cookie=None) -> defaultdict[str, List[dict]]: - """Obtem as disciplinas de um departamento""" - discipline_scraper = DisciplineWebScraper( - department_id, current_year, current_period, url, session, cookie) - disciplines = discipline_scraper.get_disciplines() - - return disciplines - - class DisciplineWebScraper: # Classe que faz o web scraping das disciplinas def __init__(self, department: str, year: str, period: str, url=URL, session=None, cookie=None): @@ -84,17 +77,17 @@ def __init__(self, department: str, year: str, period: str, url=URL, session=Non else: self.cookie = cookie + self.response = None + def get_response_from_disciplines_post_request(self) -> requests.Response: # Faz uma requisição POST para obter a resposta das turmas disponíveis - response = self.session.post( + self.response = self.session.post( self.url, headers=HEADERS, cookies=self.cookie, data=self.data ) - return response - def get_teachers(self, data: list) -> list: teachers = [] @@ -240,7 +233,7 @@ def make_disciplines(self, rows: str) -> None: "days": days }) - def make_web_scraping_of_disciplines(self, response) -> None: + def retrieve_classes_tables(self, response): # Faz o web scraping das disciplinas soup = BeautifulSoup(response.content, "html.parser") # Find the