From e9bd338c3427819552cf0b02f5745cb70df2db84 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:16:05 -0300
Subject: [PATCH 1/7] Fix error in the execution of the institutions load.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 institution/scripts/load_institution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/institution/scripts/load_institution.py b/institution/scripts/load_institution.py
index 8c0895c5..fe86f81f 100644
--- a/institution/scripts/load_institution.py
+++ b/institution/scripts/load_institution.py
@@ -15,6 +15,6 @@ def run(user_id, length=None, country=None):
     elif user_id and length:
         load_institution.apply_async(args=(int(user_id), int(length)))
     elif user_id:
-        load_institution.apply_async(args=(int(user_id)))
+        load_institution.apply_async(args=(int(user_id), ))
     else:
        print(_("Param user_id required."))
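For context on the one-line fix: Celery's ``apply_async`` expects ``args`` to be a tuple or list of positional arguments. ``(int(user_id))`` is only a parenthesized integer — parentheses alone do not create a tuple — so the dispatch fails. The trailing comma is what makes it a one-element tuple:

    # Parentheses without a comma do not create a tuple:
    args = (42)      # still an int
    args = (42,)     # a one-element tuple, which is what apply_async needs
    # Hence the corrected call:
    load_institution.apply_async(args=(int(user_id),))
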
From b87a46c9fc02590664e5ce145df5c82a149f2158 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:16:54 -0300
Subject: [PATCH 2/7] Add the Sucupira data concatenation tasks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/tasks.py | 108 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/article/tasks.py b/article/tasks.py
index 35be5c60..8e770e95 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -1,12 +1,13 @@
 import logging
+import pandas as pd
 
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.utils.translation import gettext as _
 
 from article import models
-from core.models import Source
 from config import celery_app
+from core.models import Source
 from core.utils import utils as core_utils
 
 logger = logging.getLogger(__name__)
@@ -158,7 +159,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
     _source, _ = Source.objects.get_or_create(name="OPENALEX")
 
     try:
-        flag = True 
+        flag = True
         article_count = 0
 
         while flag:
@@ -185,7 +186,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                             article,
                         )
                     )
-                    article_count += 1 
+                    article_count += 1
 
                 cursor = payload["meta"]["next_cursor"]
 
@@ -408,3 +409,104 @@ def contributors(authors):
             logger.info("Article: %s, %s" % (article, created))
         except Exception as e:
             logger.error("Error on save article: %s" % e)
+
+
+@celery_app.task(name="Concatenates the Sucupira intellectual production with the details of the production")
+def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
+    """
+    This task concatenates the CAPES article production file with the production detail file.
+
+    The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the production_file_csv:
+
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_PRODUCAO_INTELECTUAL', 'NM_PRODUCAO', 'ID_TIPO_PRODUCAO',
+    'NM_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO',
+    'ID_FORMULARIO_PRODUCAO', 'NM_FORMULARIO', 'ID_AREA_CONCENTRACAO',
+    'NM_AREA_CONCENTRACAO', 'ID_LINHA_PESQUISA', 'NM_LINHA_PESQUISA',
+    'ID_PROJETO', 'NM_PROJETO', 'DH_INICIO_AREA_CONC', 'DH_FIM_AREA_CONC',
+    'DH_INICIO_LINHA', 'DH_FIM_LINHA', 'IN_GLOSA',
+    'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+
+    The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
+
+    The columns of the detail_file_csv:
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE_PRODUCAO', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'DS_NATUREZA', 'NR_VOLUME',
+    'DS_FASCICULO', 'NR_SERIE', 'NR_PAGINA_FINAL', 'NR_PAGINA_INICIAL',
+    'DS_IDIOMA', 'DS_DIVULGACAO', 'DS_URL', 'DS_OBSERVACOES', 'NM_EDITORA',
+    'NM_CIDADE', 'DS_DOI', 'DS_ISSN', 'ID_VALOR_LISTA', 'DS_URL_DOI',
+    'IN_GLOSA']
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
+    """
+    df = pd.read_csv(production_file_csv, encoding='iso-8859-1', delimiter=';')
+
+    ddf = pd.read_csv(detail_file_csv, encoding='iso-8859-1', delimiter=';', low_memory=False)
+
+    # Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL join key
+    diff_cols = ['ID_ADD_PRODUCAO_INTELECTUAL']
+
+    # Find the columns that are not in the first DataFrame and extend the column list
+    diff_cols.extend(list(ddf.columns.difference(df.columns)))
+
+    # Rebuild the detail DataFrame with only the differing columns
+    ddf2 = ddf[diff_cols]
+
+    # Merge the two DataFrames
+    dfj = pd.merge(df, ddf2, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+
+    logger.info("Total of concatenated lines: %s" % str(dfj.shape))
+    logger.info("Columns: %s" % set(dfj.columns))
+
+    return dfj.to_json() if json else dfj
+
+
+@celery_app.task(name="Concatenates the author with the details of the production")
+def concat_author_sucupira(djf, author_files, json=False):
+    """
+    This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
+
+    The djf is a dataframe with the columns:
+
+    {'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
+
+    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the author_files:
+    ['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
+    'QT_ANO_EGRESSO_M', 'QT_ANO_EGRESSO_F', 'QT_ANO_EGRESSO_D',
+    'QT_ANO_EGRESSO_R', 'CD_PROGRAMA_IES', 'NM_PROGRAMA_IES',
+    'SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO',
+    'ID_ADD_PRODUCAO_INTELECTUAL', 'NR_ORDEM', 'ID_PESSOA_DISCENTE',
+    'ID_PESSOA_DOCENTE', 'ID_PARTICIPANTE_PPG_IES',
+    'ID_PESSOA_PART_EXTERNO', 'ID_PESSOA_POS_DOC', 'ID_PESSOA_EGRESSO',
+    'NM_AUTOR', 'TP_AUTOR', 'NM_TP_CATEGORIA_DOCENTE', 'NM_NIVEL_DISCENTE',
+    'NM_ABNT_AUTOR', 'CD_AREA_CONHECIMENTO', 'NM_AREA_CONHECIMENTO',
+    'ID_NATUREZA_ATUACAO', 'NM_NATUREZA_ATUACAO', 'ID_PAIS', 'NM_PAIS',
+    'IN_GLOSA']
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+    """
+    dfas = pd.DataFrame()
+
+    for file in author_files:
+        data = pd.read_csv(file, encoding='iso-8859-1', delimiter=';')
+        dfas = pd.concat([dfas, data], axis=0)
+
+    dfgrupa = pd.DataFrame(dfas.groupby(['ID_ADD_PRODUCAO_INTELECTUAL']) \
+        .apply(lambda x: x[['NM_AUTOR', 'NM_PROGRAMA_IES', \
+        'SG_ENTIDADE_ENSINO', 'NM_ABNT_AUTOR']].to_dict(orient='records')) \
+        .rename("DICT_AUTORES")).reset_index()
+
+    djau = pd.merge(djf, dfgrupa, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+
+    logger.info("Total of concatenated author lines: %s" % str(djau.shape))
+    logger.info("Columns: %s" % set(djau.columns))
+
+    return djau.to_json() if json else djau
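The densest step in ``concat_author_sucupira`` above is the groupby/apply chain that folds the one-row-per-author table into one row per production, collecting the author records in a single ``DICT_AUTORES`` column. A minimal sketch of the same pattern with toy data (the values are made up for illustration, not real CAPES records):

    import pandas as pd

    dfas = pd.DataFrame({
        "ID_ADD_PRODUCAO_INTELECTUAL": [1, 1, 2],
        "NM_AUTOR": ["ANA", "BIA", "CAIO"],
        "NM_ABNT_AUTOR": ["ANA, A.", "BIA, B.", "CAIO, C."],
    })

    # One row per production; each row carries a list of author dicts.
    dfgrupa = pd.DataFrame(
        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
        .apply(lambda x: x[["NM_AUTOR", "NM_ABNT_AUTOR"]].to_dict(orient="records"))
        .rename("DICT_AUTORES")
    ).reset_index()

    # ID_ADD_PRODUCAO_INTELECTUAL 1 -> [{'NM_AUTOR': 'ANA', ...}, {'NM_AUTOR': 'BIA', ...}]
    # ID_ADD_PRODUCAO_INTELECTUAL 2 -> [{'NM_AUTOR': 'CAIO', ...}]

The left merge on ``ID_ADD_PRODUCAO_INTELECTUAL`` then attaches that author list to every production row, which is exactly what the task returns.
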
From 5628a273ec34e5df3193fbde5e642ede839ec372 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:17:14 -0300
Subject: [PATCH 3/7] Add the execution script.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/scripts/concat_sucupira.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 article/scripts/concat_sucupira.py

diff --git a/article/scripts/concat_sucupira.py b/article/scripts/concat_sucupira.py
new file mode 100644
index 00000000..c7677d09
--- /dev/null
+++ b/article/scripts/concat_sucupira.py
@@ -0,0 +1,25 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import concat_article_sucupira_detail, concat_author_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0, file_name="sucupira_article.csv"):
+    """
+    Concatenate the CAPES article production file with its detail and author data.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",")
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                df = concat_article_sucupira_detail(production_file_csv, detail_file_csv)
+
+                if authors:
+                    ddfau = concat_author_sucupira(df, authors)
+                    ddfau.to_csv(file_name, index=False)
+            else:
+                concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True))
+        else:
+            print(_("It looks like the given path is not a file!"))
\ No newline at end of file
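Assuming the django-extensions convention already documented for ``load_openalex`` (``python manage.py runscript ...``), the new script would be invoked like this — the CSV file names are illustrative, and the trailing ``1`` is the ``sync`` flag:

    python manage.py runscript concat_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1

Two details worth noting: the asynchronous branch dispatches ``concat_article_sucupira_detail`` with ``json=True`` and skips the author step entirely (a DataFrame is not serializable as a Celery result under the default JSON serializer); and ``authors.split(",")`` raises ``AttributeError`` when ``authors`` is left as ``None``, so the authors argument is effectively required despite its default.
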
file_name="sucupira_article.csv"): + """ + Concate the a file with the article production in CAPES + """ + sync = bool(int(sync)) + authors = authors.split(",") + + if production_file_csv and detail_file_csv: + if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv): + if sync: + df = concat_article_sucupira_detail(production_file_csv, detail_file_csv) + + if authors: + ddfau = concat_author_sucupira(df, authors) + ddfau.to_csv(file_name, index=False) + else: + concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True)) + else: + print(_("It looks like the given path is not a file!")) \ No newline at end of file From 3f6763eebe8c4d819e70ee36126dd66a45f4d5ab Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:16:31 -0300 Subject: [PATCH 4/7] Altera o modelo de SourceArticle aumenta a quantidade de itens no campo DOI. --- .../migrations/0014_alter_sourcearticle_doi.py | 18 ++++++++++++++++++ article/models.py | 8 ++++---- 2 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 article/migrations/0014_alter_sourcearticle_doi.py diff --git a/article/migrations/0014_alter_sourcearticle_doi.py b/article/migrations/0014_alter_sourcearticle_doi.py new file mode 100644 index 00000000..043640fa --- /dev/null +++ b/article/migrations/0014_alter_sourcearticle_doi.py @@ -0,0 +1,18 @@ +# Generated by Django 4.1.6 on 2023-07-25 17:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("article", "0013_alter_article_sources_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="sourcearticle", + name="doi", + field=models.CharField(max_length=255, null=True, verbose_name="DOI"), + ), + ] diff --git a/article/models.py b/article/models.py index ba5045c3..fe4b7ea6 100644 --- a/article/models.py +++ b/article/models.py @@ -737,7 +737,7 @@ class SourceArticle(models.Model): is_paratext = models.BooleanField( _("Paratext"), default=False, null=True, blank=True ) - doi = models.CharField(_("DOI"), max_length=100, null=True, blank=False) + doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) updated = models.CharField( _("Source updated date"), max_length=50, null=True, blank=False ) @@ -813,13 +813,13 @@ def get(cls, **kwargs): filters = {} - if not kwargs.get("doi") and not kwargs.get("specific_id"): + if not kwargs.get("doi") and not kwargs.get("specific_id") and not kwargs.get("source"): raise ValueError("Param doi or specific_id is required") if kwargs.get("doi"): - filters = {"doi": kwargs.get("doi")} + filters = {"doi": kwargs.get("doi"), 'source': kwargs.get("source")} elif kwargs.get("specific_id"): - filters = {"specific_id": kwargs.get("specific_id")} + filters = {"specific_id": kwargs.get("specific_id"), 'source': kwargs.get("source")} return cls.objects.get(**filters) From 06c385d009dd60a64538dc1f31d2d1740b7c4156 Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:17:37 -0300 Subject: [PATCH 5/7] =?UTF-8?q?Adiciona=20a=20task=20para=20carga=20do=20s?= =?UTF-8?q?ucupira=20e=20o=20script=20de=20execu=C3=A7=C3=A3o.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- article/scripts/load_sucupira.py | 21 ++++++ article/tasks.py | 110 +++++++++++++++++++++++-------- 2 files changed, 102 insertions(+), 29 deletions(-) create mode 100644 article/scripts/load_sucupira.py diff --git 
From 06c385d009dd60a64538dc1f31d2d1740b7c4156 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 15:17:37 -0300
Subject: [PATCH 5/7] Add the Sucupira load task and its execution script.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/scripts/load_sucupira.py |  21 ++++++
 article/tasks.py                 | 110 +++++++++++++++++++++++--------
 2 files changed, 102 insertions(+), 29 deletions(-)
 create mode 100644 article/scripts/load_sucupira.py

diff --git a/article/scripts/load_sucupira.py b/article/scripts/load_sucupira.py
new file mode 100644
index 00000000..fa31d56c
--- /dev/null
+++ b/article/scripts/load_sucupira.py
@@ -0,0 +1,21 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import load_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0):
+    """
+    Load the Sucupira data into article.models.SourceArticle.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",")
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                load_sucupira(production_file_csv, detail_file_csv, authors)
+            else:
+                load_sucupira.apply_async(args=(production_file_csv, detail_file_csv, authors))
+        else:
+            print(_("It looks like the given path is not a file!"))
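Mirroring the ``runscript`` example in the ``load_openalex`` docstring, the script above would be run like this (file names illustrative; the trailing ``1`` requests a synchronous, in-process load):

    python manage.py runscript load_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1

The same caveat as ``concat_sucupira.py`` applies: omitting the authors argument leaves ``authors`` as ``None``, and ``authors.split(",")`` then raises ``AttributeError``.
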
diff --git a/article/tasks.py b/article/tasks.py
index 8e770e95..a4a74248 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -34,7 +34,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
 
         tasks.load_openalex(date=2012)
 
-    Running using a script: 
+    Running using a script:
 
         python manage.py runscript load_openalex --script-args 1 2012
 
@@ -149,7 +149,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                 "Department of Exercise Epidemiology, Centre for Research in Childhood Health, University of Southern Denmark, Odense, Denmark"
             ]
         }
-    } 
+    }
     """
     url = (
         settings.URL_API_OPENALEX
@@ -177,12 +177,14 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                     article["source"] = _source
                     article["raw"] = item
 
-                    article, is_created = models.SourceArticle.create_or_update(**article)
+                    article, created = models.SourceArticle.create_or_update(
+                        **article
+                    )
 
                     logger.info(
                         "%s: %s"
                         % (
-                            "Created article" if is_created else "Updated article",
+                            "Created article" if created else "Updated article",
                             article,
                         )
                     )
@@ -220,7 +222,7 @@ def contributors(authors):
     """
     This function generates a list of contributors.
 
-    This function reads the key ``authorships``, which has this structure: 
+    This function reads the key ``authorships``, which has this structure:
 
         "authorships":[
         {
@@ -304,18 +306,19 @@ def contributors(authors):
             affs = []
 
             for aff in au.get("raw_affiliation_strings"):
-
                 aff_obj, _ = models.Affiliation.create_or_update(
                     **{"name": aff}
                 )
                 affs.append(aff_obj)
 
             author_dict.update(
-                {'affiliations': affs, 'affiliations_string': au.get("raw_affiliation_string")})
+                {
+                    "affiliations": affs,
+                    "affiliations_string": au.get("raw_affiliation_string"),
+                }
+            )
 
-            contributor, _ = models.Contributor.create_or_update(
-                **author_dict
-            )
+            contributor, _ = models.Contributor.create_or_update(**author_dict)
 
             contributors.append(contributor)
@@ -324,7 +327,6 @@ def contributors(authors):
     # read SourceArticle
     for article in models.SourceArticle.objects.filter(source__name="OPENALEX"):
         try:
-            doi = article.doi
             # title
             title = core_utils.nestget(article.raw, "title")
 
@@ -347,7 +349,9 @@ def contributors(authors):
 
             # Get the journal data
             if article.raw.get("primary_location"):
-                journal_data = core_utils.nestget(article.raw, "primary_location", "source")
+                journal_data = core_utils.nestget(
+                    article.raw, "primary_location", "source"
+                )
                 if journal_data:
                     j_issn_l = journal_data.get("issn_l")
                     if journal_data.get("issn"):
@@ -411,14 +415,16 @@ def contributors(authors):
             logger.error("Error on save article: %s" % e)
 
 
-@celery_app.task(name="Concatenates the Sucupira intellectual production with the details of the production")
+@celery_app.task(
+    name="Concatenates the Sucupira intellectual production with the details of the production"
+)
 def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
     """
     This task concatenates the CAPES article production file with the production detail file.
 
     The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
 
-    The columns of the production_file_csv: 
+    The columns of the production_file_csv:
 
     ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
@@ -434,7 +440,7 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
     'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
 
     The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
 
     The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
 
-    The columns of the detail_file_csv: 
+    The columns of the detail_file_csv:
     ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
@@ -445,12 +451,14 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
 
     The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
     """
-    df = pd.read_csv(production_file_csv, encoding='iso-8859-1', delimiter=';')
+    df = pd.read_csv(production_file_csv, encoding="iso-8859-1", delimiter=";")
 
-    ddf = pd.read_csv(detail_file_csv, encoding='iso-8859-1', delimiter=';', low_memory=False)
+    ddf = pd.read_csv(
+        detail_file_csv, encoding="iso-8859-1", delimiter=";", low_memory=False
+    )
 
     # Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL join key
-    diff_cols = ['ID_ADD_PRODUCAO_INTELECTUAL']
+    diff_cols = ["ID_ADD_PRODUCAO_INTELECTUAL"]
 
     # Find the columns that are not in the first DataFrame and extend the column list
     diff_cols.extend(list(ddf.columns.difference(df.columns)))
@@ -459,7 +467,7 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
     # Rebuild the detail DataFrame with only the differing columns
     ddf2 = ddf[diff_cols]
 
     # Merge the two DataFrames
-    dfj = pd.merge(df, ddf2, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+    dfj = pd.merge(df, ddf2, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
 
     logger.info("Total of concatenated lines: %s" % str(dfj.shape))
     logger.info("Columns: %s" % set(dfj.columns))
@@ -472,13 +480,13 @@ def concat_author_sucupira(djf, author_files, json=False):
     """
     This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
 
-    The djf is a dataframe with the columns: 
+    The djf is a dataframe with the columns:
 
     {'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
 
-    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu 
+    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
 
-    The columns of the author_files: 
+    The columns of the author_files:
     ['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
@@ -496,17 +504,61 @@ def concat_author_sucupira(djf, author_files, json=False):
     dfas = pd.DataFrame()
 
     for file in author_files:
-        data = pd.read_csv(file, encoding='iso-8859-1', delimiter=';')
+        data = pd.read_csv(file, encoding="iso-8859-1", delimiter=";")
         dfas = pd.concat([dfas, data], axis=0)
-
-    dfgrupa = pd.DataFrame(dfas.groupby(['ID_ADD_PRODUCAO_INTELECTUAL']) \
-        .apply(lambda x: x[['NM_AUTOR', 'NM_PROGRAMA_IES', \
-        'SG_ENTIDADE_ENSINO', 'NM_ABNT_AUTOR']].to_dict(orient='records')) \
-        .rename("DICT_AUTORES")).reset_index()
 
-    djau = pd.merge(djf, dfgrupa, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+    dfgrupa = pd.DataFrame(
+        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
+        .apply(
+            lambda x: x[
+                ["NM_AUTOR", "NM_PROGRAMA_IES", "SG_ENTIDADE_ENSINO", "NM_ABNT_AUTOR"]
+            ].to_dict(orient="records")
+        )
+        .rename("DICT_AUTORES")
+    ).reset_index()
+
+    djau = pd.merge(djf, dfgrupa, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
 
     logger.info("Total of concatenated author lines: %s" % str(djau.shape))
     logger.info("Columns: %s" % set(djau.columns))
 
     return djau.to_json() if json else djau
+
+
+@celery_app.task(name="Load Sucupira data to SourceArticle")
+def load_sucupira(production_file_csv, detail_file_csv, authors):
+    """
+    This task reads the Sucupira files and adds each article to ``article.models.SourceArticle``.
+    """
+
+    dfau = concat_author_sucupira(
+        concat_article_sucupira_detail(production_file_csv, detail_file_csv), authors
+    )
+
+    _source, _ = Source.objects.get_or_create(name="SUCUPIRA")
+
+    for index, row in dfau.iterrows():
+        doi = "" if str(row["DS_DOI"]) == "nan" else row["DS_DOI"]
+
+        # Fall back to DS_URL_DOI when DS_DOI is empty
+        if not doi:
+            doi = "" if str(row["DS_URL_DOI"]) == "nan" else row["DS_URL_DOI"]
+
+        specific_id = str(row["ID_ADD_PRODUCAO_INTELECTUAL"])
+
+        article_source_dict = {
+            "doi": doi,
+            "specific_id": specific_id,
+            "year": row["AN_BASE_PRODUCAO"],
+            "source": _source,
+            "raw": row.to_json()
+        }
+
+        article, created = models.SourceArticle.create_or_update(
+            **article_source_dict
+        )
+
+        logger.info(
+            "####%s####, %s, %s"
+            % (index.numerator, article.doi or article.specific_id, created)
+        )
\ No newline at end of file
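``load_sucupira`` detects missing DOIs with ``str(row["DS_DOI"]) == "nan"``, which works because ``str(float("nan"))`` yields ``"nan"``, but it would also misread a literal ``"nan"`` string as a missing value. A sketch of the more explicit pandas check, offered only as an alternative:

    import pandas as pd

    # pd.isna() recognizes NaN/None directly instead of relying on str() formatting.
    doi = "" if pd.isna(row["DS_DOI"]) else row["DS_DOI"]
    if not doi:
        doi = "" if pd.isna(row["DS_URL_DOI"]) else row["DS_URL_DOI"]

(In the log call, ``index.numerator`` is just the integer index itself; plain ``index`` would log the same value.)
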
+ "####%s####, %s, %s" + % (index.numerator, article.doi or article.specific_id, created) + ) \ No newline at end of file From 588d8e211836f1099eb143506cc7aecf467f994f Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:22:04 -0300 Subject: [PATCH 6/7] =?UTF-8?q?Adiciona=20o=20pandas=20como=20depend=C3=AA?= =?UTF-8?q?ncia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements/base.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements/base.txt b/requirements/base.txt index 55662ae1..b44e4224 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -72,4 +72,8 @@ tenacity==8.2.2 # https://pypi.org/project/tenacity/ # DRF - Yet another Swagger generator 2 # ------------------------------------------------------------------------------ -drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/ \ No newline at end of file +drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/ + +# Pandas +# ------------------------------------------------------------------------------ +pandas==2.0.1 # https://pandas.pydata.org/ \ No newline at end of file From ae58714bb8161e0083ba333ba2bc74e7d819ef07 Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 17:10:13 -0300 Subject: [PATCH 7/7] Aumenta a quantidade de caracteres do DOI. --- article/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article/models.py b/article/models.py index fe4b7ea6..f42ced24 100644 --- a/article/models.py +++ b/article/models.py @@ -734,10 +734,10 @@ class SourceArticle(models.Model): _("Specific Id"), max_length=255, null=False, blank=False ) year = models.CharField(_("Year"), max_length=10, null=True, blank=True) + doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) is_paratext = models.BooleanField( _("Paratext"), default=False, null=True, blank=True ) - doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) updated = models.CharField( _("Source updated date"), max_length=50, null=True, blank=False )