Merge pull request #228 from gitnnolabs/tk199-refactory
WIP: Add preparation of the Sucupira data and loading of the Sucupira data.
gitnnolabs authored Jul 25, 2023
2 parents 3db437e + ae58714 commit 701296a
Showing 7 changed files with 243 additions and 21 deletions.
18 changes: 18 additions & 0 deletions article/migrations/0014_alter_sourcearticle_doi.py
@@ -0,0 +1,18 @@
# Generated by Django 4.1.6 on 2023-07-25 17:48

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("article", "0013_alter_article_sources_and_more"),
]

operations = [
migrations.AlterField(
model_name="sourcearticle",
name="doi",
field=models.CharField(max_length=255, null=True, verbose_name="DOI"),
),
]
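
For context: a schema change like this one is normally captured and applied with Django's standard migration commands; a minimal sketch, assuming the usual project layout:

    python manage.py makemigrations article
    python manage.py migrate article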
8 changes: 4 additions & 4 deletions article/models.py
@@ -734,10 +734,10 @@ class SourceArticle(models.Model):
_("Specific Id"), max_length=255, null=False, blank=False
)
year = models.CharField(_("Year"), max_length=10, null=True, blank=True)
doi = models.CharField(_("DOI"), max_length=255, null=True, blank=False)
is_paratext = models.BooleanField(
_("Paratext"), default=False, null=True, blank=True
)
doi = models.CharField(_("DOI"), max_length=100, null=True, blank=False)
updated = models.CharField(
_("Source updated date"), max_length=50, null=True, blank=False
)
@@ -813,13 +813,13 @@ def get(cls, **kwargs):

filters = {}

if not kwargs.get("doi") and not kwargs.get("specific_id"):
if not kwargs.get("doi") and not kwargs.get("specific_id") and not kwargs.get("source"):
raise ValueError("Param doi, specific_id or source is required")

if kwargs.get("doi"):
filters = {"doi": kwargs.get("doi")}
filters = {"doi": kwargs.get("doi"), 'source': kwargs.get("source")}
elif kwargs.get("specific_id"):
filters = {"specific_id": kwargs.get("specific_id")}
filters = {"specific_id": kwargs.get("specific_id"), 'source': kwargs.get("source")}

return cls.objects.get(**filters)

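With source now part of the lookup, the same DOI can exist once per source without tripping get(). A minimal usage sketch of the new call shape (the DOI value and shell context are assumptions, not part of this diff):

    # Hypothetical Django shell session exercising the source-scoped lookup.
    from article import models
    from core.models import Source

    source, _ = Source.objects.get_or_create(name="SUCUPIRA")
    article = models.SourceArticle.get(doi="10.1590/example", source=source)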
25 changes: 25 additions & 0 deletions article/scripts/concat_sucupira.py
@@ -0,0 +1,25 @@
import os
from django.utils.translation import gettext as _

from article.tasks import concat_article_sucupira_detail, concat_author_sucupira


def run(production_file_csv, detail_file_csv, authors=None, sync=0, file_name="sucupira_article.csv"):
"""
Concatenate the CAPES article production file with the production details file
"""
sync = bool(int(sync))
authors = authors.split(",") if authors else []

if production_file_csv and detail_file_csv:
if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
if sync:
df = concat_article_sucupira_detail(production_file_csv, detail_file_csv)

if authors:
ddfau = concat_author_sucupira(df, authors)
ddfau.to_csv(file_name, index=False)
else:
concat_article_sucupira_detail.apply_async(args=(production_file_csv, detail_file_csv, True))
else:
print(_("It looks like the given path is not a file!"))
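Following the runscript convention shown in the load_openalex docstring, this script could plausibly be invoked as below; the CSV file names are placeholders:

    python manage.py runscript concat_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1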
21 changes: 21 additions & 0 deletions article/scripts/load_sucupira.py
@@ -0,0 +1,21 @@
import os
from django.utils.translation import gettext as _

from article.tasks import load_sucupira


def run(production_file_csv, detail_file_csv, authors=None, sync=0):
"""
Load the Sucupira data into article.models.SourceArticle
"""
sync = bool(int(sync))
authors = authors.split(",") if authors else []

if production_file_csv and detail_file_csv:
if os.path.isfile(production_file_csv) and os.path.isfile(detail_file_csv):
if sync:
load_sucupira(production_file_csv, detail_file_csv, authors)
else:
load_sucupira.apply_async(args=(production_file_csv, detail_file_csv, authors))
else:
print(_("It looks like the given path is not a file!"))
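A plausible invocation, mirroring the previous script (placeholder file names, trailing 1 for the synchronous path):

    python manage.py runscript load_sucupira --script-args producao.csv detalhes.csv autores_2017.csv,autores_2018.csv 1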
184 changes: 169 additions & 15 deletions article/tasks.py
@@ -1,12 +1,13 @@
import logging

import pandas as pd
from django.conf import settings
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _

from article import models
from core.models import Source
from config import celery_app
from core.models import Source
from core.utils import utils as core_utils

logger = logging.getLogger(__name__)
@@ -33,7 +34,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
tasks.load_openalex(date=2012)
Running using a script:
python manage.py runscript load_openalex --script-args 1 2012
@@ -148,7 +149,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
"Department of Exercise Epidemiology, Centre for Research in Childhood Health, University of Southern Denmark, Odense, Denmark"
]
}
}
"""
url = (
settings.URL_API_OPENALEX
@@ -158,7 +159,7 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
_source, _ = Source.objects.get_or_create(name="OPENALEX")

try:
flag = True
article_count = 0

while flag:
@@ -176,16 +177,18 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
article["source"] = _source
article["raw"] = item

article, is_created = models.SourceArticle.create_or_update(**article)
article, created = models.SourceArticle.create_or_update(
**article
)

logger.info(
"%s: %s"
% (
"Created article" if is_created else "Updated article",
"Created article" if created else "Updated article",
article,
)
)
article_count += 1

cursor = payload["meta"]["next_cursor"]

@@ -219,7 +222,7 @@ def contributors(authors):
"""
This function generates a list of contributors.
This function gets the key ``authorships``, which has this structure:
"authorships":[
{
@@ -303,18 +306,19 @@ def contributors(authors):
affs = []

for aff in au.get("raw_affiliation_strings"):

aff_obj, _ = models.Affiliation.create_or_update(
**{"name": aff}
)
affs.append(aff_obj)

author_dict.update(
{'affiliations': affs, 'affiliations_string': au.get("raw_affiliation_string")})
{
"affiliations": affs,
"affiliations_string": au.get("raw_affiliation_string"),
}
)

contributor, _ = models.Contributor.create_or_update(
**author_dict
)
contributor, _ = models.Contributor.create_or_update(**author_dict)

contributors.append(contributor)

@@ -323,7 +327,6 @@ def contributors(authors):
# read SourceArticle
for article in models.SourceArticle.objects.filter(source__name="OPENALEX"):
try:

doi = article.doi
# title
title = core_utils.nestget(article.raw, "title")
@@ -346,7 +349,9 @@ def contributors(authors):

# Get the journal data
if article.raw.get("primary_location"):
journal_data = core_utils.nestget(article.raw, "primary_location", "source")
journal_data = core_utils.nestget(
article.raw, "primary_location", "source"
)
if journal_data:
j_issn_l = journal_data.get("issn_l")
if journal_data.get("issn"):
@@ -408,3 +413,152 @@ def contributors(authors):
logger.info("Article: %s, %s" % (article, created))
except Exception as e:
logger.error("Erro on save article: %s" % e)


@celery_app.task(
name="Concatenates the Sucupira intellectual production with the details of the production"
)
def concat_article_sucupira_detail(production_file_csv, detail_file_csv, json=False):
"""
This task concatenates the CAPES article production file with the production details file.
The source of the production_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
The columns of the production_file_csv:
['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_ADD_PRODUCAO_INTELECTUAL',
'ID_PRODUCAO_INTELECTUAL', 'NM_PRODUCAO', 'ID_TIPO_PRODUCAO',
'NM_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO',
'ID_FORMULARIO_PRODUCAO', 'NM_FORMULARIO', 'ID_AREA_CONCENTRACAO',
'NM_AREA_CONCENTRACAO', 'ID_LINHA_PESQUISA', 'NM_LINHA_PESQUISA',
'ID_PROJETO', 'NM_PROJETO', 'DH_INICIO_AREA_CONC', 'DH_FIM_AREA_CONC',
'DH_INICIO_LINHA', 'DH_FIM_LINHA', 'IN_GLOSA',
'IN_PRODUCAO_COM_VINCULO_TCC', 'ID_ADD_TRABALHO_CONCLUSAO_CT'],
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
The source of the detail_file_csv: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-detalhes-da-producao-intelectual-bibliografica-de-programas-de-pos-graduacao
The columns of the detail_file_csv:
['CD_PROGRAMA_IES', 'NM_PROGRAMA_IES', 'SG_ENTIDADE_ENSINO',
'NM_ENTIDADE_ENSINO', 'AN_BASE_PRODUCAO', 'ID_ADD_PRODUCAO_INTELECTUAL',
'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO', 'DS_NATUREZA', 'NR_VOLUME',
'DS_FASCICULO', 'NR_SERIE', 'NR_PAGINA_FINAL', 'NR_PAGINA_INICIAL',
'DS_IDIOMA', 'DS_DIVULGACAO', 'DS_URL', 'DS_OBSERVACOES', 'NM_EDITORA',
'NM_CIDADE', 'DS_DOI', 'DS_ISSN', 'ID_VALOR_LISTA', 'DS_URL_DOI',
'IN_GLOSA']
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/8498a5f7-de52-4fb9-8c62-b827cb27bcf9/resource/c6064162-3e13-4b71-ac47-114f83771002/download/metadados_detalhes_producao_intelectual_bibliografica_2017a2020.pdf
"""
df = pd.read_csv(production_file_csv, encoding="iso-8859-1", delimiter=";")

ddf = pd.read_csv(
detail_file_csv, encoding="iso-8859-1", delimiter=";", low_memory=False
)

# Build the column list, preserving the ID_ADD_PRODUCAO_INTELECTUAL column
diff_cols = ["ID_ADD_PRODUCAO_INTELECTUAL"]

# Find the columns that are not in the first DataFrame and extend the column list
diff_cols.extend(list(ddf.columns.difference(df.columns)))

# Rebuild ddf with only the differing columns
ddf2 = ddf[diff_cols]

# Merge the two DataFrames on the production id
dfj = pd.merge(df, ddf2, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")

logger.info("Total of lines concatenates: %s" % str(dfj.shape))
logger.info("Columns: %s" % set(dfj.columns))

return dfj.to_json() if json else dfj
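
The column-difference trick above avoids duplicating columns that both CSVs share. A self-contained toy version of the same pattern (synthetic data, not the CAPES columns):

    import pandas as pd

    df = pd.DataFrame({"ID": [1, 2], "A": ["x", "y"]})
    ddf = pd.DataFrame({"ID": [1, 2], "A": ["x", "y"], "B": [10, 20]})

    # Keep the join key, then add only the columns missing from df.
    diff_cols = ["ID"]
    diff_cols.extend(list(ddf.columns.difference(df.columns)))  # ["ID", "B"]

    merged = pd.merge(df, ddf[diff_cols], on="ID", how="left")
    # merged has columns ID, A, B -- "A" is not duplicated by the merge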


@celery_app.task(name="Concatenates the author with the details of the production")
def concat_author_sucupira(djf, author_files, json=False):
"""
This task concatenates the Sucupira author files with the result of the ``concat_article_sucupira_detail`` task.
The djf is a dataframe with the columns:
{'DS_OBSERVACOES', 'NM_PROGRAMA_IES', 'ID_PRODUCAO_INTELECTUAL', 'NR_SERIE', 'DS_FASCICULO', 'ID_ADD_TRABALHO_CONCLUSAO_CT', 'DS_URL_DOI', 'DH_INICIO_LINHA', 'ID_ADD_PRODUCAO_INTELECTUAL', 'NM_CIDADE', 'ID_AREA_CONCENTRACAO', 'DS_DIVULGACAO', 'DS_IDIOMA', 'NM_ENTIDADE_ENSINO', 'AN_BASE', 'ID_LINHA_PESQUISA', 'ID_VALOR_LISTA', 'NM_TIPO_PRODUCAO', 'NM_AREA_CONCENTRACAO', 'ID_PROJETO', 'CD_PROGRAMA_IES', 'ID_FORMULARIO_PRODUCAO', 'DH_INICIO_AREA_CONC', 'DS_NATUREZA', 'NM_FORMULARIO', 'SG_ENTIDADE_ENSINO', 'NR_PAGINA_FINAL', 'NM_SUBTIPO_PRODUCAO', 'ID_TIPO_PRODUCAO', 'NR_VOLUME', 'NR_PAGINA_INICIAL', 'ID_SUBTIPO_PRODUCAO', 'IN_GLOSA', 'AN_BASE_PRODUCAO', 'DS_DOI', 'NM_PRODUCAO', 'NM_PROJETO', 'DH_FIM_LINHA', 'DS_ISSN', 'IN_PRODUCAO_COM_VINCULO_TCC', 'DH_FIM_AREA_CONC', 'NM_EDITORA', 'NM_LINHA_PESQUISA', 'DS_URL'}
The source of the author_files: https://dadosabertos.capes.gov.br/dataset/2017-a-2020-autor-da-producao-intelectual-de-programas-de-pos-graduacao-stricto-sensu
The columns of the author_files:
['AN_BASE', 'ID_TIPO_PRODUCAO', 'ID_SUBTIPO_PRODUCAO',
'QT_ANO_EGRESSO_M', 'QT_ANO_EGRESSO_F', 'QT_ANO_EGRESSO_D',
'QT_ANO_EGRESSO_R', 'CD_PROGRAMA_IES', 'NM_PROGRAMA_IES',
'SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO',
'ID_ADD_PRODUCAO_INTELECTUAL', 'NR_ORDEM', 'ID_PESSOA_DISCENTE',
'ID_PESSOA_DOCENTE', 'ID_PARTICIPANTE_PPG_IES',
'ID_PESSOA_PART_EXTERNO', 'ID_PESSOA_POS_DOC', 'ID_PESSOA_EGRESSO',
'NM_AUTOR', 'TP_AUTOR', 'NM_TP_CATEGORIA_DOCENTE', 'NM_NIVEL_DISCENTE',
'NM_ABNT_AUTOR', 'CD_AREA_CONHECIMENTO', 'NM_AREA_CONHECIMENTO',
'ID_NATUREZA_ATUACAO', 'NM_NATUREZA_ATUACAO', 'ID_PAIS', 'NM_PAIS',
'IN_GLOSA']
The dictionary of the data is in this file: https://dadosabertos.capes.gov.br/dataset/de69242b-03b0-4d38-b5b2-a9169abd84c2/resource/40b83217-dc80-4d30-8db1-4ee91dea3ecc/download/metadados_autor_producao_intelectual_2017_2020.pdf
"""
dfas = pd.DataFrame()

for file in author_files:
data = pd.read_csv(file, encoding="iso-8859-1", delimiter=";")
dfas = pd.concat([dfas, data], axis=0)

dfgrupa = pd.DataFrame(
dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
.apply(
lambda x: x[
["NM_AUTOR", "NM_PROGRAMA_IES", "SG_ENTIDADE_ENSINO", "NM_ABNT_AUTOR"]
].to_dict(orient="records")
)
.rename("DICT_AUTORES")
).reset_index()

djau = pd.merge(djf, dfgrupa, on="ID_ADD_PRODUCAO_INTELECTUAL", how="left")

logger.info("Total of authors lines concatenates: %s" % str(djau.shape))
logger.info("Columns: %s" % set(djau.columns))

return djau.to_json() if json else djau
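
The groupby/apply step collapses one row per author into a single records list per production id. A toy sketch of that shape (synthetic values):

    import pandas as pd

    dfas = pd.DataFrame(
        {"ID_ADD_PRODUCAO_INTELECTUAL": [1, 1, 2], "NM_AUTOR": ["ANA", "BIA", "CARLOS"]}
    )

    dfgrupa = pd.DataFrame(
        dfas.groupby(["ID_ADD_PRODUCAO_INTELECTUAL"])
        .apply(lambda x: x[["NM_AUTOR"]].to_dict(orient="records"))
        .rename("DICT_AUTORES")
    ).reset_index()

    # id 1 -> [{'NM_AUTOR': 'ANA'}, {'NM_AUTOR': 'BIA'}]; id 2 -> [{'NM_AUTOR': 'CARLOS'}]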


@celery_app.task(name="Load Sucupira data to SourceArticle")
def load_sucupira(production_file_csv, detail_file_csv, authors):
"""
This task reads the Sucupira files and adds the articles to ``article.models.SourceArticle``
"""

dfau = concat_author_sucupira(
concat_article_sucupira_detail(production_file_csv, detail_file_csv), authors
)

_source, _ = Source.objects.get_or_create(name="SUCUPIRA")

for index, row in dfau.iterrows():
doi = "" if str(row["DS_DOI"]) == "nan" else row["DS_DOI"]

# Try to fill the doi by DS_URL_DOI
if not doi:
doi = "" if str(row["DS_URL_DOI"]) == "nan" else row["DS_URL_DOI"]

specific_id = str(row["ID_ADD_PRODUCAO_INTELECTUAL"])

article_source_dict = {
"doi": doi,
"specific_id": specific_id,
"year": row["AN_BASE_PRODUCAO"],
"source": _source,
"raw": row.to_json()
}

article, created = models.SourceArticle.create_or_update(
**article_source_dict
)

logger.info(
"####%s####, %s, %s"
% (index.numerator, article.doi or article.specific_id, created)
)
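
One way to sanity-check a finished load from a Django shell might be (a sketch, not part of this diff):

    from article import models

    qs = models.SourceArticle.objects.filter(source__name="SUCUPIRA")
    print(qs.count(), list(qs.values_list("doi", "specific_id")[:5]))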
2 changes: 1 addition & 1 deletion institution/scripts/load_institution.py
@@ -15,6 +15,6 @@ def run(user_id, length=None, country=None):
elif user_id and length:
load_institution.apply_async(args=(int(user_id), int(length)))
elif user_id:
load_institution.apply_async(args=(int(user_id)))
load_institution.apply_async(args=(int(user_id), ))
else:
print(_("Param user_id required."))
6 changes: 5 additions & 1 deletion requirements/base.txt
@@ -72,4 +72,8 @@ tenacity==8.2.2 # https://pypi.org/project/tenacity/

# DRF - Yet another Swagger generator 2
# ------------------------------------------------------------------------------
drf-yasg==1.21.5 # https://pypi.org/project/drf-yasg2/

# Pandas
# ------------------------------------------------------------------------------
pandas==2.0.1 # https://pandas.pydata.org/
