diff --git a/article/migrations/0014_alter_sourcearticle_doi.py b/article/migrations/0014_alter_sourcearticle_doi.py
new file mode 100644
index 00000000..043640fa
--- /dev/null
+++ b/article/migrations/0014_alter_sourcearticle_doi.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.1.6 on 2023-07-25 17:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("article", "0013_alter_article_sources_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="sourcearticle",
+            name="doi",
+            field=models.CharField(max_length=255, null=True, verbose_name="DOI"),
+        ),
+    ]
diff --git a/article/models.py b/article/models.py
index ba5045c3..f42ced24 100644
--- a/article/models.py
+++ b/article/models.py
@@ -734,10 +734,10 @@ class SourceArticle(models.Model):
         _("Specific Id"), max_length=255, null=False, blank=False
     )
     year = models.CharField(_("Year"), max_length=10, null=True, blank=True)
+    doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False)
     is_paratext = models.BooleanField(
         _("Paratext"), default=False, null=True, blank=True
     )
-    doi = models.CharField(_("DOI"), max_length=100, null=True, blank=False)
     updated = models.CharField(
         _("Source updated date"), max_length=50, null=True, blank=False
     )
@@ -813,13 +813,13 @@ def get(cls, **kwargs):
 
         filters = {}
 
-        if not kwargs.get("doi") and not kwargs.get("specific_id"):
+        if not kwargs.get("doi") and not kwargs.get("specific_id") and not kwargs.get("source"):
             raise ValueError("Param doi or specific_id is required")
 
         if kwargs.get("doi"):
-            filters = {"doi": kwargs.get("doi")}
+            filters = {"doi": kwargs.get("doi"), "source": kwargs.get("source")}
         elif kwargs.get("specific_id"):
-            filters = {"specific_id": kwargs.get("specific_id")}
+            filters = {"specific_id": kwargs.get("specific_id"), "source": kwargs.get("source")}
 
         return cls.objects.get(**filters)
 
diff --git a/article/scripts/concat_sucupira.py b/article/scripts/concat_sucupira.py
new file mode 100644
index 00000000..c7677d09
--- /dev/null
+++ b/article/scripts/concat_sucupira.py
@@ -0,0 +1,25 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import concat_article_sucupira_detail, concat_author_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0, file_name="sucupira_article.csv"):
+    """
+    Concatenate the file with the article production in CAPES.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",") if authors else []
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                df = concat_article_sucupira_detail(production_file_csv, detail_file_csv)
+
+                if authors:
+                    ddfau = concat_author_sucupira(df, authors)
+                    ddfau.to_csv(file_name, index=False)
+            else:
+                concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True))
+        else:
+            print(_("It looks like the given path is not a file!"))
diff --git a/article/scripts/load_sucupira.py b/article/scripts/load_sucupira.py
new file mode 100644
index 00000000..fa31d56c
--- /dev/null
+++ b/article/scripts/load_sucupira.py
@@ -0,0 +1,21 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import load_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0):
+    """
+    Load the Sucupira data to article.models.SourceArticle.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",") if authors else []
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                load_sucupira(production_file_csv, detail_file_csv, authors)
+            else:
+                load_sucupira.apply_async(args=(production_file_csv, detail_file_csv, authors))
+        else:
+            print(_("It looks like the given path is not a file!"))
diff --git a/article/tasks.py b/article/tasks.py
index 35be5c60..a4a74248 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -1,12 +1,13 @@
 import logging
+import pandas as pd
 
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.utils.translation import gettext as _
 
 from article import models
-from core.models import Source
 from config import celery_app
+from core.models import Source
 from core.utils import utils as core_utils
 
 logger = logging.getLogger(__name__)
@@ -33,7 +34,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
 
         tasks.load_openalex(date=2012)
 
-    Running using a script: 
+    Running using a script:
 
         python manage.py runscript load_openalex --script-args 1 2012
 
@@ -148,7 +149,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                 "Department of Exercise Epidemiology, Centre for Research in Childhood Health, University of Southern Denmark, Odense, Denmark"
             ]
         }
-    }
+    }
    """
     url = (
         settings.URL_API_OPENALEX
@@ -158,7 +159,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
 
     _source, _ = Source.objects.get_or_create(name="OPENALEX")
     try:
-        flag = True 
+        flag = True
         article_count = 0
 
         while flag:
@@ -176,16 +177,18 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                 article["source"] = _source
                 article["raw"] = item
 
-                article, is_created = models.SourceArticle.create_or_update(**article)
+                article, created = models.SourceArticle.create_or_update(
+                    **article
+                )
 
                 logger.info(
                     "%s: %s"
                     % (
-                        "Created article" if is_created else "Updated article",
+                        "Created article" if created else "Updated article",
                         article,
                     )
                 )
-                article_count += 1 
+                article_count += 1
 
             cursor = payload["meta"]["next_cursor"]
 
@@ -219,7 +222,7 @@
     """
     This function generate a list os contributors list.
 
-    This function get the key ``authorships`` from with this struture: 
+    This function gets the key ``authorships`` with this structure:
 
         "authorships":[
            {
@@ -303,18 +306,19 @@
 
             affs = []
             for aff in au.get("raw_affiliation_strings"):
-
                 aff_obj, _ = models.Affiliation.create_or_update(
                     **{"name": aff}
                 )
                 affs.append(aff_obj)
 
             author_dict.update(
-                {'affiliations': affs, 'affiliations_string': au.get("raw_affiliation_string")})
+                {
+                    "affiliations": affs,
+                    "affiliations_string": au.get("raw_affiliation_string"),
+                }
+            )
 
-            contributor, _ = models.Contributor.create_or_update(
-                **author_dict
-            )
+            contributor, _ = models.Contributor.create_or_update(**author_dict)
 
             contributors.append(contributor)
 
@@ -323,7 +327,6 @@
     # read SourceArticle
     for article in models.SourceArticle.objects.filter(source__name="OPENALEX"):
         try:
-            doi = article.doi
             # title
             title = core_utils.nestget(article.raw, "title")
 
@@ -346,7 +349,9 @@
 
         # Get the journal data
         if article.raw.get("primary_location"):
-            journal_data = core_utils.nestget(article.raw, "primary_location", "source")
+            journal_data = core_utils.nestget(
+                article.raw, "primary_location", "source"
+            )
             if journal_data:
                 j_issn_l = journal_data.get("issn_l")
                 if journal_data.get("issn"):
@@ -408,3 +413,152 @@
             logger.info("Article: %s, %s" % (article, created))
         except Exception as e:
             logger.error("Erro on save article: %s" % e)
+
+
+@celery_app.task(
+    name="Concatenates the Sucupira intellectual production with the details of the production"
+)
+def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
+    """
+    This task concatenates the file with the article production in CAPES.
+
+    The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the production_file_csv:
+
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_PRODUCAO_INTELECTUAL', 'NM_PRODUCAO', 'ID_TIPO_PRODUCAO',
+    'NM_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO',
+    'ID_FORMULARIO_PRODUCAO', 'NM_FORMULARIO', 'ID_AREA_CONCENTRACAO',
+    'NM_AREA_CONCENTRACAO', 'ID_LINHA_PESQUISA', 'NM_LINHA_PESQUISA',
+    'ID_PROJETO', 'NM_PROJETO', 'DH_INICIO_AREA_CONC', 'DH_FIM_AREA_CONC',
+    'DH_INICIO_LINHA', 'DH_FIM_LINHA', 'IN_GLOSA',
+    'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
+
+    The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+
+    The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
+
+    The columns of the detail_file_csv:
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE_PRODUCAO', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'DS_NATUREZA', 'NR_VOLUME',
+    'DS_FASCICULO', 'NR_SERIE', 'NR_PAGINA_FINAL', 'NR_PAGINA_INICIAL',
+    'DS_IDIOMA', 'DS_DIVULGACAO', 'DS_URL', 'DS_OBSERVACOES', 'NM_EDITORA',
+    'NM_CIDADE', 'DS_DOI', 'DS_ISSN', 'ID_VALOR_LISTA', 'DS_URL_DOI',
+    'IN_GLOSA']
+
+    The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
+    """
+    df = pd.read_csv(production_file_csv, encoding="iso-8859-1", delimiter=";")
+
+    ddf = pd.read_csv(
+        detail_file_csv, encoding="iso-8859-1", delimiter=";", low_memory=False
+    )
+
+    # Build the column list, always keeping the ID_ADD_PRODUCAO_INTELECTUAL join key
+    diff_cols = ["ID_ADD_PRODUCAO_INTELECTUAL"]
+
+    # Add the columns that exist only in the detail DataFrame
+    diff_cols.extend(list(ddf.columns.difference(df.columns)))
+
+    # Rebuild the detail DataFrame with only the non-overlapping columns
+    ddf2 = ddf[diff_cols]
+
+    # Left-join the detail columns onto the production rows
+    dfj = pd.merge(df, ddf2, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
+
+    logger.info("Total of lines concatenates: %s" % str(dfj.shape))
+    logger.info("Columns: %s" % set(dfj.columns))
+
+    return dfj.to_json() if json else dfj
+
+
+@celery_app.task(name="Concatenates the author with the details of the production")
+def concat_author_sucupira(djf, author_files, json=False):
+    """
+    This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
+
+    The djf is a dataframe with the columns:
+
+    {'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
+
+    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the production_file_csv:
+    ['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
+    'QT_ANO_EGRESSO_M', 'QT_ANO_EGRESSO_F', 'QT_ANO_EGRESSO_D',
+    'QT_ANO_EGRESSO_R', 'CD_PROGRAMA_IES', 'NM_PROGRAMA_IES',
+    'SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO',
+    'ID_ADD_PRODUCAO_INTELECTUAL', 'NR_ORDEM', 'ID_PESSOA_DISCENTE',
+    'ID_PESSOA_DOCENTE', 'ID_PARTICIPANTE_PPG_IES',
+    'ID_PESSOA_PART_EXTERNO', 'ID_PESSOA_POS_DOC', 'ID_PESSOA_EGRESSO',
+    'NM_AUTOR', 'TP_AUTOR', 'NM_TP_CATEGORIA_DOCENTE', 'NM_NIVEL_DISCENTE',
+    'NM_ABNT_AUTOR', 'CD_AREA_CONHECIMENTO', 'NM_AREA_CONHECIMENTO',
+    'ID_NATUREZA_ATUACAO', 'NM_NATUREZA_ATUACAO', 'ID_PAIS', 'NM_PAIS',
+    'IN_GLOSA']
+
+    The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+    """
+    dfas = pd.DataFrame()
+
+    for file in author_files:
+        data = pd.read_csv(file, encoding="iso-8859-1", delimiter=";")
+        dfas = pd.concat([dfas, data], axis=0)
+
+    dfgrupa = pd.DataFrame(
+        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
+        .apply(
+            lambda x: x[
+                ["NM_AUTOR", "NM_PROGRAMA_IES", "SG_ENTIDADE_ENSINO", "NM_ABNT_AUTOR"]
+            ].to_dict(orient="records")
+        )
+        .rename("DICT_AUTORES")
+    ).reset_index()
+
+    djau = pd.merge(djf, dfgrupa, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
+
+    logger.info("Total of authors lines concatenates: %s" % str(djau.shape))
+    logger.info("Columns: %s" % set(djau.columns))
+
+    return djau.to_json() if json else djau
+
+
+@celery_app.task(name="Load Sucupira data to SourceArticle")
+def load_sucupira(production_file_csv, detail_file_csv, authors):
+    """
+    This task reads the Sucupira files and adds the articles to ``article.models.SourceArticle``.
+    """
+
+    dfau = concat_author_sucupira(
+        concat_article_sucupira_detail(production_file_csv, detail_file_csv), authors
+    )
+
+    _source, _ = Source.objects.get_or_create(name="SUCUPIRA")
+
+    for index, row in dfau.iterrows():
+        doi = "" if str(row["DS_DOI"]) == "nan" else row["DS_DOI"]
+
+        # Try to fill the doi by DS_URL_DOI
+        if not doi:
+            doi = "" if str(row["DS_URL_DOI"]) == "nan" else row["DS_URL_DOI"]
+
+        specific_id = str(row["ID_ADD_PRODUCAO_INTELECTUAL"])
+
+        article_source_dict = {
+            "doi": doi,
+            "specific_id": specific_id,
+            "year": row["AN_BASE_PRODUCAO"],
+            "source": _source,
+            "raw": row.to_json()
+        }
+
+        article, created = models.SourceArticle.create_or_update(
+            **article_source_dict
+        )
+
+        logger.info(
+            "####%s####, %s, %s"
+            % (index, article.doi or article.specific_id, created)
+        )
diff --git a/institution/scripts/load_institution.py b/institution/scripts/load_institution.py
index 8c0895c5..fe86f81f 100644
--- a/institution/scripts/load_institution.py
+++ b/institution/scripts/load_institution.py
@@ -15,6 +15,6 @@ def run(user_id, length=None, country=None):
     elif user_id and length:
         load_institution.apply_async(args=(int(user_id), int(length)))
     elif user_id:
-        load_institution.apply_async(args=(int(user_id)))
+        load_institution.apply_async(args=(int(user_id),))
     else:
         print(_("Param user_id required."))
diff --git a/requirements/base.txt b/requirements/base.txt
index 55662ae1..b44e4224 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -72,4 +72,8 @@ tenacity==8.2.2  # https://pypi.org/project/tenacity/
 
 # DRF - Yet another Swagger generator 2
 # ------------------------------------------------------------------------------
-drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/
\ No newline at end of file
+drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/
+
+# Pandas
+# ------------------------------------------------------------------------------
+pandas==2.0.1 # https://pandas.pydata.org/