From e9bd338c3427819552cf0b02f5745cb70df2db84 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:16:05 -0300
Subject: [PATCH 1/7] Fix error in the execution of the institutions load.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 institution/scripts/load_institution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/institution/scripts/load_institution.py b/institution/scripts/load_institution.py
index 8c0895c5..fe86f81f 100644
--- a/institution/scripts/load_institution.py
+++ b/institution/scripts/load_institution.py
@@ -15,6 +15,6 @@ def run(user_id, length=None, country=None):
     elif user_id and length:
         load_institution.apply_async(args=(int(user_id), int(length)))
     elif user_id:
-        load_institution.apply_async(args=(int(user_id)))
+        load_institution.apply_async(args=(int(user_id), ))
     else:
        print(_("Param user_id required."))
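For context on the one-line fix: Celery's ``apply_async`` expects ``args`` to be a tuple or list of positional arguments. ``(int(user_id))`` is only a parenthesized integer — parentheses alone do not create a tuple — so the dispatch fails. The trailing comma is what makes it a one-element tuple:

    # Parentheses without a comma do not create a tuple:
    args = (42)      # still an int
    args = (42,)     # a one-element tuple, which is what apply_async needs
    # Hence the corrected call:
    load_institution.apply_async(args=(int(user_id),))
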
From b87a46c9fc02590664e5ce145df5c82a149f2158 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:16:54 -0300
Subject: [PATCH 2/7] Add the Sucupira data concatenation tasks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/tasks.py | 108 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/article/tasks.py b/article/tasks.py
index 35be5c60..8e770e95 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -1,12 +1,13 @@
 import logging
+import pandas as pd
 
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.utils.translation import gettext as _
 
 from article import models
-from core.models import Source
 from config import celery_app
+from core.models import Source
 from core.utils import utils as core_utils
 
 logger = logging.getLogger(__name__)
@@ -158,7 +159,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
     _source, _ = Source.objects.get_or_create(name="OPENALEX")
 
     try:
-        flag = True 
+        flag = True
         article_count = 0
 
         while flag:
@@ -185,7 +186,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                             article,
                         )
                     )
-                    article_count += 1 
+                    article_count += 1
 
                 cursor = payload["meta"]["next_cursor"]
 
@@ -408,3 +409,104 @@ def contributors(authors):
             logger.info("Article: %s, %s" % (article, created))
         except Exception as e:
             logger.error("Error on save article: %s" % e)
+
+
+@celery_app.task(name="Concatenates the Sucupira intellectual production with the details of the production")
+def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
+    """
+    This task concatenates the CAPES article production file with the production detail file.
+
+    The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the production_file_csv:
+
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_PRODUCAO_INTELECTUAL', 'NM_PRODUCAO', 'ID_TIPO_PRODUCAO',
+    'NM_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO',
+    'ID_FORMULARIO_PRODUCAO', 'NM_FORMULARIO', 'ID_AREA_CONCENTRACAO',
+    'NM_AREA_CONCENTRACAO', 'ID_LINHA_PESQUISA', 'NM_LINHA_PESQUISA',
+    'ID_PROJETO', 'NM_PROJETO', 'DH_INICIO_AREA_CONC', 'DH_FIM_AREA_CONC',
+    'DH_INICIO_LINHA', 'DH_FIM_LINHA', 'IN_GLOSA',
+    'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+
+    The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
+
+    The columns of the detail_file_csv:
+    ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
+    'NM_ENTIDADE_ENSINO', 'AN_BASE_PRODUCAO', 'ID_ADD_PRODUCAO_INTELECTUAL',
+    'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'DS_NATUREZA', 'NR_VOLUME',
+    'DS_FASCICULO', 'NR_SERIE', 'NR_PAGINA_FINAL', 'NR_PAGINA_INICIAL',
+    'DS_IDIOMA', 'DS_DIVULGACAO', 'DS_URL', 'DS_OBSERVACOES', 'NM_EDITORA',
+    'NM_CIDADE', 'DS_DOI', 'DS_ISSN', 'ID_VALOR_LISTA', 'DS_URL_DOI',
+    'IN_GLOSA']
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
+    """
+    df = pd.read_csv(production_file_csv, encoding='iso-8859-1', delimiter=';')
+
+    ddf = pd.read_csv(detail_file_csv, encoding='iso-8859-1', delimiter=';', low_memory=False)
+
+    # Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL join key
+    diff_cols = ['ID_ADD_PRODUCAO_INTELECTUAL']
+
+    # Find the columns that are not in the first DataFrame and extend the column list
+    diff_cols.extend(list(ddf.columns.difference(df.columns)))
+
+    # Rebuild the detail DataFrame with only the differing columns
+    ddf2 = ddf[diff_cols]
+
+    # Merge the two DataFrames
+    dfj = pd.merge(df, ddf2, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+
+    logger.info("Total of concatenated lines: %s" % str(dfj.shape))
+    logger.info("Columns: %s" % set(dfj.columns))
+
+    return dfj.to_json() if json else dfj
+
+
+@celery_app.task(name="Concatenates the author with the details of the production")
+def concat_author_sucupira(djf, author_files, json=False):
+    """
+    This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
+
+    The djf is a dataframe with the columns:
+
+    {'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
+
+    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
+
+    The columns of the author_files:
+    ['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
+    'QT_ANO_EGRESSO_M', 'QT_ANO_EGRESSO_F', 'QT_ANO_EGRESSO_D',
+    'QT_ANO_EGRESSO_R', 'CD_PROGRAMA_IES', 'NM_PROGRAMA_IES',
+    'SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO',
+    'ID_ADD_PRODUCAO_INTELECTUAL', 'NR_ORDEM', 'ID_PESSOA_DISCENTE',
+    'ID_PESSOA_DOCENTE', 'ID_PARTICIPANTE_PPG_IES',
+    'ID_PESSOA_PART_EXTERNO', 'ID_PESSOA_POS_DOC', 'ID_PESSOA_EGRESSO',
+    'NM_AUTOR', 'TP_AUTOR', 'NM_TP_CATEGORIA_DOCENTE', 'NM_NIVEL_DISCENTE',
+    'NM_ABNT_AUTOR', 'CD_AREA_CONHECIMENTO', 'NM_AREA_CONHECIMENTO',
+    'ID_NATUREZA_ATUACAO', 'NM_NATUREZA_ATUACAO', 'ID_PAIS', 'NM_PAIS',
+    'IN_GLOSA']
+
+    The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
+    """
+    dfas = pd.DataFrame()
+
+    for file in author_files:
+        data = pd.read_csv(file, encoding='iso-8859-1', delimiter=';')
+        dfas = pd.concat([dfas, data], axis=0)
+
+    dfgrupa = pd.DataFrame(dfas.groupby(['ID_ADD_PRODUCAO_INTELECTUAL']) \
+        .apply(lambda x: x[['NM_AUTOR', 'NM_PROGRAMA_IES', \
+        'SG_ENTIDADE_ENSINO', 'NM_ABNT_AUTOR']].to_dict(orient='records')) \
+        .rename("DICT_AUTORES")).reset_index()
+
+    djau = pd.merge(djf, dfgrupa, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+
+    logger.info("Total of concatenated author lines: %s" % str(djau.shape))
+    logger.info("Columns: %s" % set(djau.columns))
+
+    return djau.to_json() if json else djau
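The densest step in ``concat_author_sucupira`` above is the groupby/apply chain that folds the one-row-per-author table into one row per production, collecting the author records in a single ``DICT_AUTORES`` column. A minimal sketch of the same pattern with toy data (the values are made up for illustration, not real CAPES records):

    import pandas as pd

    dfas = pd.DataFrame({
        "ID_ADD_PRODUCAO_INTELECTUAL": [1, 1, 2],
        "NM_AUTOR": ["ANA", "BIA", "CAIO"],
        "NM_ABNT_AUTOR": ["ANA, A.", "BIA, B.", "CAIO, C."],
    })

    # One row per production; each row carries a list of author dicts.
    dfgrupa = pd.DataFrame(
        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
        .apply(lambda x: x[["NM_AUTOR", "NM_ABNT_AUTOR"]].to_dict(orient="records"))
        .rename("DICT_AUTORES")
    ).reset_index()

    # ID_ADD_PRODUCAO_INTELECTUAL 1 -> [{'NM_AUTOR': 'ANA', ...}, {'NM_AUTOR': 'BIA', ...}]
    # ID_ADD_PRODUCAO_INTELECTUAL 2 -> [{'NM_AUTOR': 'CAIO', ...}]

The left merge on ``ID_ADD_PRODUCAO_INTELECTUAL`` then attaches that author list to every production row, which is exactly what the task returns.
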
From 5628a273ec34e5df3193fbde5e642ede839ec372 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:17:14 -0300
Subject: [PATCH 3/7] Add the execution script.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/scripts/concat_sucupira.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 article/scripts/concat_sucupira.py

diff --git a/article/scripts/concat_sucupira.py b/article/scripts/concat_sucupira.py
new file mode 100644
index 00000000..c7677d09
--- /dev/null
+++ b/article/scripts/concat_sucupira.py
@@ -0,0 +1,25 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import concat_article_sucupira_detail, concat_author_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0, file_name="sucupira_article.csv"):
+    """
+    Concatenate the CAPES article production file with its detail and author data.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",")
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                df = concat_article_sucupira_detail(production_file_csv, detail_file_csv)
+
+                if authors:
+                    ddfau = concat_author_sucupira(df, authors)
+                    ddfau.to_csv(file_name, index=False)
+            else:
+                concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True))
+        else:
+            print(_("It looks like the given path is not a file!"))
\ No newline at end of file
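Assuming the django-extensions convention already documented for ``load_openalex`` (``python manage.py runscript ...``), the new script would be invoked like this — the CSV file names are illustrative, and the trailing ``1`` is the ``sync`` flag:

    python manage.py runscript concat_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1

Two details worth noting: the asynchronous branch dispatches ``concat_article_sucupira_detail`` with ``json=True`` and skips the author step entirely (a DataFrame is not serializable as a Celery result under the default JSON serializer); and ``authors.split(",")`` raises ``AttributeError`` when ``authors`` is left as ``None``, so the authors argument is effectively required despite its default.
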
file_name="sucupira_article.csv"): + """ + Concate the a file with the article production in CAPES + """ + sync = bool(int(sync)) + authors = authors.split(",") + + if production_file_csv and detail_file_csv: + if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv): + if sync: + df = concat_article_sucupira_detail(production_file_csv, detail_file_csv) + + if authors: + ddfau = concat_author_sucupira(df, authors) + ddfau.to_csv(file_name, index=False) + else: + concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True)) + else: + print(_("It looks like the given path is not a file!")) \ No newline at end of file From 3f6763eebe8c4d819e70ee36126dd66a45f4d5ab Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:16:31 -0300 Subject: [PATCH 4/7] Altera o modelo de SourceArticle aumenta a quantidade de itens no campo DOI. --- .../migrations/0014_alter_sourcearticle_doi.py | 18 ++++++++++++++++++ article/models.py | 8 ++++---- 2 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 article/migrations/0014_alter_sourcearticle_doi.py diff --git a/article/migrations/0014_alter_sourcearticle_doi.py b/article/migrations/0014_alter_sourcearticle_doi.py new file mode 100644 index 00000000..043640fa --- /dev/null +++ b/article/migrations/0014_alter_sourcearticle_doi.py @@ -0,0 +1,18 @@ +# Generated by Django 4.1.6 on 2023-07-25 17:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("article", "0013_alter_article_sources_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="sourcearticle", + name="doi", + field=models.CharField(max_length=255, null=True, verbose_name="DOI"), + ), + ] diff --git a/article/models.py b/article/models.py index ba5045c3..fe4b7ea6 100644 --- a/article/models.py +++ b/article/models.py @@ -737,7 +737,7 @@ class SourceArticle(models.Model): is_paratext = models.BooleanField( _("Paratext"), default=False, null=True, blank=True ) - doi = models.CharField(_("DOI"), max_length=100, null=True, blank=False) + doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) updated = models.CharField( _("Source updated date"), max_length=50, null=True, blank=False ) @@ -813,13 +813,13 @@ def get(cls, **kwargs): filters = {} - if not kwargs.get("doi") and not kwargs.get("specific_id"): + if not kwargs.get("doi") and not kwargs.get("specific_id") and not kwargs.get("source"): raise ValueError("Param doi or specific_id is required") if kwargs.get("doi"): - filters = {"doi": kwargs.get("doi")} + filters = {"doi": kwargs.get("doi"), 'source': kwargs.get("source")} elif kwargs.get("specific_id"): - filters = {"specific_id": kwargs.get("specific_id")} + filters = {"specific_id": kwargs.get("specific_id"), 'source': kwargs.get("source")} return cls.objects.get(**filters) From 06c385d009dd60a64538dc1f31d2d1740b7c4156 Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:17:37 -0300 Subject: [PATCH 5/7] =?UTF-8?q?Adiciona=20a=20task=20para=20carga=20do=20s?= =?UTF-8?q?ucupira=20e=20o=20script=20de=20execu=C3=A7=C3=A3o.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- article/scripts/load_sucupira.py | 21 ++++++ article/tasks.py | 110 +++++++++++++++++++++++-------- 2 files changed, 102 insertions(+), 29 deletions(-) create mode 100644 article/scripts/load_sucupira.py diff --git 
From 06c385d009dd60a64538dc1f31d2d1740b7c4156 Mon Sep 17 00:00:00 2001
From: GitInno <86991526+gitnnolabs@users.noreply.github.com>
Date: Tue, 25 Jul 2023 15:17:37 -0300
Subject: [PATCH 5/7] Add the Sucupira load task and its execution script.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 article/scripts/load_sucupira.py |  21 ++++++
 article/tasks.py                 | 110 +++++++++++++++++++++++--------
 2 files changed, 102 insertions(+), 29 deletions(-)
 create mode 100644 article/scripts/load_sucupira.py

diff --git a/article/scripts/load_sucupira.py b/article/scripts/load_sucupira.py
new file mode 100644
index 00000000..fa31d56c
--- /dev/null
+++ b/article/scripts/load_sucupira.py
@@ -0,0 +1,21 @@
+import os
+from django.utils.translation import gettext as _
+
+from article.tasks import load_sucupira
+
+
+def run(production_file_csv, detail_file_csv, authors=None, sync=0):
+    """
+    Load the Sucupira data into article.models.SourceArticle.
+    """
+    sync = bool(int(sync))
+    authors = authors.split(",")
+
+    if production_file_csv and detail_file_csv:
+        if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
+            if sync:
+                load_sucupira(production_file_csv, detail_file_csv, authors)
+            else:
+                load_sucupira.apply_async(args=(production_file_csv, detail_file_csv, authors))
+        else:
+            print(_("It looks like the given path is not a file!"))
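Mirroring the ``runscript`` example in the ``load_openalex`` docstring, the script above would be run like this (file names illustrative; the trailing ``1`` requests a synchronous, in-process load):

    python manage.py runscript load_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1

The same caveat as ``concat_sucupira.py`` applies: omitting the authors argument leaves ``authors`` as ``None``, and ``authors.split(",")`` then raises ``AttributeError``.
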
diff --git a/article/tasks.py b/article/tasks.py
index 8e770e95..a4a74248 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -34,7 +34,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
 
         tasks.load_openalex(date=2012)
 
-    Running using a script: 
+    Running using a script:
 
         python manage.py runscript load_openalex --script-args 1 2012
 
@@ -149,7 +149,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                 "Department of Exercise Epidemiology, Centre for Research in Childhood Health, University of Southern Denmark, Odense, Denmark"
             ]
         }
-    } 
+    }
     """
     url = (
         settings.URL_API_OPENALEX
@@ -177,12 +177,14 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
                     article["source"] = _source
                     article["raw"] = item
 
-                    article, is_created = models.SourceArticle.create_or_update(**article)
+                    article, created = models.SourceArticle.create_or_update(
+                        **article
+                    )
 
                     logger.info(
                         "%s: %s"
                         % (
-                            "Created article" if is_created else "Updated article",
+                            "Created article" if created else "Updated article",
                             article,
                         )
                     )
@@ -220,7 +222,7 @@ def contributors(authors):
     """
     This function generates a list of contributors.
 
-    This function reads the key ``authorships``, which has this structure: 
+    This function reads the key ``authorships``, which has this structure:
 
         "authorships":[
         {
@@ -304,18 +306,19 @@ def contributors(authors):
             affs = []
 
             for aff in au.get("raw_affiliation_strings"):
-
                 aff_obj, _ = models.Affiliation.create_or_update(
                     **{"name": aff}
                 )
                 affs.append(aff_obj)
 
             author_dict.update(
-                {'affiliations': affs, 'affiliations_string': au.get("raw_affiliation_string")})
+                {
+                    "affiliations": affs,
+                    "affiliations_string": au.get("raw_affiliation_string"),
+                }
+            )
 
-            contributor, _ = models.Contributor.create_or_update(
-                **author_dict
-            )
+            contributor, _ = models.Contributor.create_or_update(**author_dict)
 
             contributors.append(contributor)
@@ -324,7 +327,6 @@ def contributors(authors):
     # read SourceArticle
     for article in models.SourceArticle.objects.filter(source__name="OPENALEX"):
         try:
-            doi = article.doi
             # title
             title = core_utils.nestget(article.raw, "title")
 
@@ -347,7 +349,9 @@ def contributors(authors):
 
             # Get the journal data
             if article.raw.get("primary_location"):
-                journal_data = core_utils.nestget(article.raw, "primary_location", "source")
+                journal_data = core_utils.nestget(
+                    article.raw, "primary_location", "source"
+                )
                 if journal_data:
                     j_issn_l = journal_data.get("issn_l")
                     if journal_data.get("issn"):
@@ -411,14 +415,16 @@ def contributors(authors):
             logger.error("Error on save article: %s" % e)
 
 
-@celery_app.task(name="Concatenates the Sucupira intellectual production with the details of the production")
+@celery_app.task(
+    name="Concatenates the Sucupira intellectual production with the details of the production"
+)
 def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
     """
     This task concatenates the CAPES article production file with the production detail file.
 
     The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
 
-    The columns of the production_file_csv: 
+    The columns of the production_file_csv:
 
     ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
@@ -434,7 +440,7 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
     'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
 
     The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
 
     The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
 
-    The columns of the detail_file_csv: 
+    The columns of the detail_file_csv:
     ['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
@@ -445,12 +451,14 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
 
     The data dictionary is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
     """
-    df = pd.read_csv(production_file_csv, encoding='iso-8859-1', delimiter=';')
+    df = pd.read_csv(production_file_csv, encoding="iso-8859-1", delimiter=";")
 
-    ddf = pd.read_csv(detail_file_csv, encoding='iso-8859-1', delimiter=';', low_memory=False)
+    ddf = pd.read_csv(
+        detail_file_csv, encoding="iso-8859-1", delimiter=";", low_memory=False
+    )
 
     # Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL join key
-    diff_cols = ['ID_ADD_PRODUCAO_INTELECTUAL']
+    diff_cols = ["ID_ADD_PRODUCAO_INTELECTUAL"]
 
     # Find the columns that are not in the first DataFrame and extend the column list
     diff_cols.extend(list(ddf.columns.difference(df.columns)))
@@ -459,7 +467,7 @@ def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=Fa
     # Rebuild the detail DataFrame with only the differing columns
     ddf2 = ddf[diff_cols]
 
     # Merge the two DataFrames
-    dfj = pd.merge(df, ddf2, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+    dfj = pd.merge(df, ddf2, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
 
     logger.info("Total of concatenated lines: %s" % str(dfj.shape))
     logger.info("Columns: %s" % set(dfj.columns))
@@ -472,13 +480,13 @@ def concat_author_sucupira(djf, author_files, json=False):
     """
     This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
 
-    The djf is a dataframe with the columns: 
+    The djf is a dataframe with the columns:
 
     {'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
 
-    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu 
+    The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
 
-    The columns of the author_files: 
+    The columns of the author_files:
     ['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
@@ -496,17 +504,61 @@ def concat_author_sucupira(djf, author_files, json=False):
     dfas = pd.DataFrame()
 
     for file in author_files:
-        data = pd.read_csv(file, encoding='iso-8859-1', delimiter=';')
+        data = pd.read_csv(file, encoding="iso-8859-1", delimiter=";")
         dfas = pd.concat([dfas, data], axis=0)
-
-    dfgrupa = pd.DataFrame(dfas.groupby(['ID_ADD_PRODUCAO_INTELECTUAL']) \
-        .apply(lambda x: x[['NM_AUTOR', 'NM_PROGRAMA_IES', \
-        'SG_ENTIDADE_ENSINO', 'NM_ABNT_AUTOR']].to_dict(orient='records')) \
-        .rename("DICT_AUTORES")).reset_index()
 
-    djau = pd.merge(djf, dfgrupa, on='ID_ADD_PRODUCAO_INTELECTUAL', how='left')
+    dfgrupa = pd.DataFrame(
+        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
+        .apply(
+            lambda x: x[
+                ["NM_AUTOR", "NM_PROGRAMA_IES", "SG_ENTIDADE_ENSINO", "NM_ABNT_AUTOR"]
+            ].to_dict(orient="records")
+        )
+        .rename("DICT_AUTORES")
+    ).reset_index()
+
+    djau = pd.merge(djf, dfgrupa, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")
 
     logger.info("Total of concatenated author lines: %s" % str(djau.shape))
     logger.info("Columns: %s" % set(djau.columns))
 
     return djau.to_json() if json else djau
+
+
+@celery_app.task(name="Load Sucupira data to SourceArticle")
+def load_sucupira(production_file_csv, detail_file_csv, authors):
+    """
+    This task reads the Sucupira files and adds each article to ``article.models.SourceArticle``.
+    """
+
+    dfau = concat_author_sucupira(
+        concat_article_sucupira_detail(production_file_csv, detail_file_csv), authors
+    )
+
+    _source, _ = Source.objects.get_or_create(name="SUCUPIRA")
+
+    for index, row in dfau.iterrows():
+        doi = "" if str(row["DS_DOI"]) == "nan" else row["DS_DOI"]
+
+        # Fall back to DS_URL_DOI when DS_DOI is empty
+        if not doi:
+            doi = "" if str(row["DS_URL_DOI"]) == "nan" else row["DS_URL_DOI"]
+
+        specific_id = str(row["ID_ADD_PRODUCAO_INTELECTUAL"])
+
+        article_source_dict = {
+            "doi": doi,
+            "specific_id": specific_id,
+            "year": row["AN_BASE_PRODUCAO"],
+            "source": _source,
+            "raw": row.to_json()
+        }
+
+        article, created = models.SourceArticle.create_or_update(
+            **article_source_dict
+        )
+
+        logger.info(
+            "####%s####, %s, %s"
+            % (index.numerator, article.doi or article.specific_id, created)
+        )
\ No newline at end of file
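``load_sucupira`` detects missing DOIs with ``str(row["DS_DOI"]) == "nan"``, which works because ``str(float("nan"))`` yields ``"nan"``, but it would also misread a literal ``"nan"`` string as a missing value. A sketch of the more explicit pandas check, offered only as an alternative:

    import pandas as pd

    # pd.isna() recognizes NaN/None directly instead of relying on str() formatting.
    doi = "" if pd.isna(row["DS_DOI"]) else row["DS_DOI"]
    if not doi:
        doi = "" if pd.isna(row["DS_URL_DOI"]) else row["DS_URL_DOI"]

(In the log call, ``index.numerator`` is just the integer index itself; plain ``index`` would log the same value.)
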
+ "####%s####, %s, %s" + % (index.numerator, article.doi or article.specific_id, created) + ) \ No newline at end of file From 588d8e211836f1099eb143506cc7aecf467f994f Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:22:04 -0300 Subject: [PATCH 6/7] =?UTF-8?q?Adiciona=20o=20pandas=20como=20depend=C3=AA?= =?UTF-8?q?ncia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements/base.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements/base.txt b/requirements/base.txt index 55662ae1..b44e4224 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -72,4 +72,8 @@ tenacity==8.2.2 # https://pypi.org/project/tenacity/ # DRF - Yet another Swagger generator 2 # ------------------------------------------------------------------------------ -drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/ \ No newline at end of file +drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/ + +# Pandas +# ------------------------------------------------------------------------------ +pandas==2.0.1 # https://pandas.pydata.org/ \ No newline at end of file From ae58714bb8161e0083ba333ba2bc74e7d819ef07 Mon Sep 17 00:00:00 2001 From: GitInno <86991526+gitnnolabs@users.noreply.github.com> Date: Tue, 25 Jul 2023 17:10:13 -0300 Subject: [PATCH 7/7] Aumenta a quantidade de caracteres do DOI. --- article/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article/models.py b/article/models.py index fe4b7ea6..f42ced24 100644 --- a/article/models.py +++ b/article/models.py @@ -734,10 +734,10 @@ class SourceArticle(models.Model): _("Specific Id"), max_length=255, null=False, blank=False ) year = models.CharField(_("Year"), max_length=10, null=True, blank=True) + doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) is_paratext = models.BooleanField( _("Paratext"), default=False, null=True, blank=True ) - doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False) updated = models.CharField( _("Source updated date"), max_length=50, null=True, blank=False )