Merge pull request #228 from gitnnolabs/tk199-refactory
WIP: Add preparation of the Sucupira data and loading of the Sucupira data.
gitnnolabs authored Jul 25, 2023
2 parents 3db437e + ae58714 commit 701296a
Showing 7 changed files with 243 additions and 21 deletions.
18 changes: 18 additions & 0 deletions article/migrations/0014_alter_sourcearticle_doi.py
@@ -0,0 +1,18 @@
# Generated by Django 4.1.6 on 2023-07-25 17:48

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("article", "0013_alter_article_sources_and_more"),
]

operations = [
migrations.AlterField(
model_name="sourcearticle",
name="doi",
field=models.CharField(max_length=255, null=True, verbose_name="DOI"),
),
]
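
For context: a schema change like this one is normally captured and applied with Django's standard migration commands; a minimal sketch, assuming the usual project layout:

    python manage.py makemigrations article
    python manage.py migrate article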
8 changes: 4 additions & 4 deletions article/models.py
@@ -734,10 +734,10 @@ class SourceArticle(models.Model):
_("Specific Id"), max_length=255, null=False, blank=False
)
year = models.CharField(_("Year"), max_length=10, null=True, blank=True)
doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False)
is_paratext = models.BooleanField(
_("Paratext"), default=False, null=True, blank=True
)
doi = models.CharField(_("DOI"), max_length=100, null=True, blank=False)
updated = models.CharField(
_("Source updated date"), max_length=50, null=True, blank=False
)
@@ -813,13 +813,13 @@ def get(cls, **kwargs):

filters = {}

if not kwargs.get("doi") and not kwargs.get("specific_id"):
if not kwargs.get("doi") and not kwargs.get("specific_id") and not kwargs.get("source"):
raise ValueError("Param doi, specific_id or source is required")

if kwargs.get("doi"):
filters = {"doi": kwargs.get("doi")}
filters = {"doi": kwargs.get("doi"), 'source': kwargs.get("source")}
elif kwargs.get("specific_id"):
filters = {"specific_id": kwargs.get("specific_id")}
filters = {"specific_id": kwargs.get("specific_id"), 'source': kwargs.get("source")}

return cls.objects.get(**filters)

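With source now part of the lookup, the same DOI can exist once per source without tripping get(). A minimal usage sketch of the new call shape (the DOI value and shell context are assumptions, not part of this diff):

    # Hypothetical Django shell session exercising the source-scoped lookup.
    from article import models
    from core.models import Source

    source, _ = Source.objects.get_or_create(name="SUCUPIRA")
    article = models.SourceArticle.get(doi="10.1590/example", source=source)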
25 changes: 25 additions & 0 deletions article/scripts/concat_sucupira.py
@@ -0,0 +1,25 @@
import os
from django.utils.translation import gettext as _

from article.tasks import concat_article_sucupira_detail, concat_author_sucupira


def run(production_file_csv, detail_file_csv, authors=None, sync=0, file_name="sucupira_article.csv"):
"""
Concatenate the CAPES article production file with the production details file
"""
sync = bool(int(sync))
authors = authors.split(",") if authors else []

if production_file_csv and detail_file_csv:
if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
if sync:
df = concat_article_sucupira_detail(production_file_csv, detail_file_csv)

if authors:
ddfau = concat_author_sucupira(df, authors)
ddfau.to_csv(file_name, index=False)
else:
concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True))
else:
print(_("It looks like the given path is not a file!"))
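Following the runscript convention shown in the load_openalex docstring, this script could plausibly be invoked as below; the CSV file names are placeholders:

    python manage.py runscript concat_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1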
21 changes: 21 additions & 0 deletions article/scripts/load_sucupira.py
@@ -0,0 +1,21 @@
import os
from django.utils.translation import gettext as _

from article.tasks import load_sucupira


def run(production_file_csv, detail_file_csv, authors=None, sync=0):
"""
Load the Sucupira data into article.models.SourceArticle
"""
sync = bool(int(sync))
authors = authors.split(",") if authors else []

if production_file_csv and detail_file_csv:
if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
if sync:
load_sucupira(production_file_csv, detail_file_csv, authors)
else:
load_sucupira.apply_async(args=(production_file_csv, detail_file_csv, authors))
else:
print(_("It looks like the given path is not a file!"))
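A plausible invocation, mirroring the previous script (placeholder file names, trailing 1 for the synchronous path):

    python manage.py runscript load_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1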
184 changes: 169 additions & 15 deletions article/tasks.py
@@ -1,12 +1,13 @@
import logging

import pandas as pd
from django.conf import settings
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _

from article import models
from core.models import Source
from config import celery_app
from core.models import Source
from core.utils import utils as core_utils

logger = logging.getLogger(__name__)
@@ -33,7 +34,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
tasks.load_openalex(date=2012)
Running using a script:
python manage.py runscript load_openalex --script-args 1 2012
@@ -148,7 +149,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
"Department of Exercise Epidemiology, Centre for Research in Childhood Health, University of Southern Denmark, Odense, Denmark"
]
}
}
"""
url = (
settings.URL_API_OPENALEX
@@ -158,7 +159,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
_source, _ = Source.objects.get_or_create(name="OPENALEX")

try:
flag = True
article_count = 0

while flag:
@@ -176,16 +177,18 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
article["source"] = _source
article["raw"] = item

article, is_created = models.SourceArticle.create_or_update(**article)
article, created = models.SourceArticle.create_or_update(
**article
)

logger.info(
"%s: %s"
% (
"Created article" if is_created else "Updated article",
"Created article" if created else "Updated article",
article,
)
)
article_count += 1

cursor = payload["meta"]["next_cursor"]

@@ -219,7 +222,7 @@ def contributors(authors):
"""
This function generates a list of contributors.
This function gets the key ``authorships``, which has this structure:
"authorships":[
{
@@ -303,18 +306,19 @@ def contributors(authors):
affs = []

for aff in au.get("raw_affiliation_strings"):

aff_obj, _ = models.Affiliation.create_or_update(
**{"name": aff}
)
affs.append(aff_obj)

author_dict.update(
{'affiliations': affs, 'affiliations_string': au.get("raw_affiliation_string")})
{
"affiliations": affs,
"affiliations_string": au.get("raw_affiliation_string"),
}
)

contributor, _ = models.Contributor.create_or_update(
**author_dict
)
contributor, _ = models.Contributor.create_or_update(**author_dict)

contributors.append(contributor)

@@ -323,7 +327,6 @@ def contributors(authors):
# read SourceArticle
for article in models.SourceArticle.objects.filter(source__name="OPENALEX"):
try:

doi = article.doi
# title
title = core_utils.nestget(article.raw, "title")
@@ -346,7 +349,9 @@ def contributors(authors):

# Get the journal data
if article.raw.get("primary_location"):
journal_data = core_utils.nestget(article.raw, "primary_location", "source")
journal_data = core_utils.nestget(
article.raw, "primary_location", "source"
)
if journal_data:
j_issn_l = journal_data.get("issn_l")
if journal_data.get("issn"):
@@ -408,3 +413,152 @@ def contributors(authors):
logger.info("Article: %s, %s" % (article, created))
except Exception as e:
logger.error("Erro on save article: %s" % e)


@celery_app.task(
name="Concatenates the Sucupira intellectual production with the details of the production"
)
def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
"""
This task concatenates the CAPES article production file with the production details file.
The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
The columns of the production_file_csv:
['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_ADD_PRODUCAO_INTELECTUAL',
'ID_PRODUCAO_INTELECTUAL', 'NM_PRODUCAO', 'ID_TIPO_PRODUCAO',
'NM_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO',
'ID_FORMULARIO_PRODUCAO', 'NM_FORMULARIO', 'ID_AREA_CONCENTRACAO',
'NM_AREA_CONCENTRACAO', 'ID_LINHA_PESQUISA', 'NM_LINHA_PESQUISA',
'ID_PROJETO', 'NM_PROJETO', 'DH_INICIO_AREA_CONC', 'DH_FIM_AREA_CONC',
'DH_INICIO_LINHA', 'DH_FIM_LINHA', 'IN_GLOSA',
'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
The columns of the detail_file_csv:
['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
'NM_ENTIDADE_ENSINO', 'AN_BASE_PRODUCAO', 'ID_ADD_PRODUCAO_INTELECTUAL',
'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'DS_NATUREZA', 'NR_VOLUME',
'DS_FASCICULO', 'NR_SERIE', 'NR_PAGINA_FINAL', 'NR_PAGINA_INICIAL',
'DS_IDIOMA', 'DS_DIVULGACAO', 'DS_URL', 'DS_OBSERVACOES', 'NM_EDITORA',
'NM_CIDADE', 'DS_DOI', 'DS_ISSN', 'ID_VALOR_LISTA', 'DS_URL_DOI',
'IN_GLOSA']
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
"""
df = pd.read_csv(production_file_csv, encoding="iso-8859-1", delimiter=";")

ddf = pd.read_csv(
detail_file_csv, encoding="iso-8859-1", delimiter=";", low_memory=False
)

# Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL column
diff_cols = ["ID_ADD_PRODUCAO_INTELECTUAL"]

# Find the columns that are not in the first DataFrame and extend the column list
diff_cols.extend(list(ddf.columns.difference(df.columns)))

# Rebuild ddf with only the differing columns
ddf2 = ddf[diff_cols]

# Merge the two DataFrames on the production id
dfj = pd.merge(df, ddf2, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")

logger.info("Total of lines concatenates: %s" % str(dfj.shape))
logger.info("Columns: %s" % set(dfj.columns))

return dfj.to_json() if json else dfj
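
The column-difference trick above avoids duplicating columns that both CSVs share. A self-contained toy version of the same pattern (synthetic data, not the CAPES columns):

    import pandas as pd

    df = pd.DataFrame({"ID": [1, 2], "A": ["x", "y"]})
    ddf = pd.DataFrame({"ID": [1, 2], "A": ["x", "y"], "B": [10, 20]})

    # Keep the join key, then add only the columns missing from df.
    diff_cols = ["ID"]
    diff_cols.extend(list(ddf.columns.difference(df.columns)))  # ["ID", "B"]

    merged = pd.merge(df, ddf[diff_cols], on="ID", how="left")
    # merged has columns ID, A, B -- "A" is not duplicated by the merge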


@celery_app.task(name="Concatenates the author with the details of the production")
def concat_author_sucupira(djf, author_files, json=False):
"""
This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
The djf is a dataframe with the columns:
{'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
The columns of the author_files:
['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
'QT_ANO_EGRESSO_M', 'QT_ANO_EGRESSO_F', 'QT_ANO_EGRESSO_D',
'QT_ANO_EGRESSO_R', 'CD_PROGRAMA_IES', 'NM_PROGRAMA_IES',
'SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO',
'ID_ADD_PRODUCAO_INTELECTUAL', 'NR_ORDEM', 'ID_PESSOA_DISCENTE',
'ID_PESSOA_DOCENTE', 'ID_PARTICIPANTE_PPG_IES',
'ID_PESSOA_PART_EXTERNO', 'ID_PESSOA_POS_DOC', 'ID_PESSOA_EGRESSO',
'NM_AUTOR', 'TP_AUTOR', 'NM_TP_CATEGORIA_DOCENTE', 'NM_NIVEL_DISCENTE',
'NM_ABNT_AUTOR', 'CD_AREA_CONHECIMENTO', 'NM_AREA_CONHECIMENTO',
'ID_NATUREZA_ATUACAO', 'NM_NATUREZA_ATUACAO', 'ID_PAIS', 'NM_PAIS',
'IN_GLOSA']
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
"""
dfas = pd.DataFrame()

for file in author_files:
data = pd.read_csv(file, encoding="iso-8859-1", delimiter=";")
dfas = pd.concat([dfas, data], axis=0)

dfgrupa = pd.DataFrame(
dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
.apply(
lambda x: x[
["NM_AUTOR", "NM_PROGRAMA_IES", "SG_ENTIDADE_ENSINO", "NM_ABNT_AUTOR"]
].to_dict(orient="records")
)
.rename("DICT_AUTORES")
).reset_index()

djau = pd.merge(djf, dfgrupa, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")

logger.info("Total of authors lines concatenates: %s" % str(djau.shape))
logger.info("Columns: %s" % set(djau.columns))

return djau.to_json() if json else djau
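
The groupby/apply step collapses one row per author into a single records list per production id. A toy sketch of that shape (synthetic values):

    import pandas as pd

    dfas = pd.DataFrame(
        {"ID_ADD_PRODUCAO_INTELECTUAL": [1, 1, 2], "NM_AUTOR": ["ANA", "BIA", "CARLOS"]}
    )

    dfgrupa = pd.DataFrame(
        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
        .apply(lambda x: x[["NM_AUTOR"]].to_dict(orient="records"))
        .rename("DICT_AUTORES")
    ).reset_index()

    # id 1 -> [{'NM_AUTOR': 'ANA'}, {'NM_AUTOR': 'BIA'}]; id 2 -> [{'NM_AUTOR': 'CARLOS'}]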


@celery_app.task(name="Load Sucupira data to SourceArticle")
def load_sucupira(production_file_csv, detail_file_csv, authors):
"""
This task reads the Sucupira files and adds the articles to ``article.models.SourceArticle``
"""

dfau = concat_author_sucupira(
concat_article_sucupira_detail(production_file_csv, detail_file_csv), authors
)

_source, _ = Source.objects.get_or_create(name="SUCUPIRA")

for index, row in dfau.iterrows():
doi = "" if str(row["DS_DOI"]) == "nan" else row["DS_DOI"]

# Try to fill the doi by DS_URL_DOI
if not doi:
doi = "" if str(row["DS_URL_DOI"]) == "nan" else row["DS_URL_DOI"]

specific_id = str(row["ID_ADD_PRODUCAO_INTELECTUAL"])

article_source_dict = {
"doi": doi,
"specific_id": specific_id,
"year": row["AN_BASE_PRODUCAO"],
"source": _source,
"raw": row.to_json()
}

article, created = models.SourceArticle.create_or_update(
**article_source_dict
)

logger.info(
"####%s####, %s, %s"
% (index.numerator, article.doi or article.specific_id, created)
)
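
One way to sanity-check a finished load from a Django shell might be (a sketch, not part of this diff):

    from article import models

    qs = models.SourceArticle.objects.filter(source__name="SUCUPIRA")
    print(qs.count(), list(qs.values_list("doi", "specific_id")[:5]))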
2 changes: 1 addition & 1 deletion institution/scripts/load_institution.py
@@ -15,6 +15,6 @@ def run(user_id, length=None, country=None):
elif user_id and length:
load_institution.apply_async(args=(int(user_id), int(length)))
elif user_id:
load_institution.apply_async(args=(int(user_id)))
load_institution.apply_async(args=(int(user_id), ))
else:
print(_("Param user_id required."))
6 changes: 5 additions & 1 deletion requirements/base.txt
@@ -72,4 +72,8 @@ tenacity==8.2.2 # https://pypi.org/project/tenacity/

# DRF - Yet another Swagger generator 2
# ------------------------------------------------------------------------------
drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/

# Pandas
# ------------------------------------------------------------------------------
pandas==2.0.1 # https://pandas.pydata.org/
