Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flavio/issue 27 #29

Merged
merged 2 commits into from
Dec 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 107 additions & 26 deletions src/dataprep/main/prep_mag/affiliation_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,57 +4,138 @@
"""
Script affiliation_outcomes.py

Generate tables:
- affiliation_outcomes: some features at the affiliation level.
- number of journal articles published per affiliation. In contrast to table Affiliations,
consider only keep_doctypes
Generate tables: at the affiliation-year-field0 level
- affiliation_outcomes: publication outcomes
- number of journal articles published and 10-year forward citations per
affiliation-year-field0. Field0 is assigned from the Field0 of the published paper
- affiliation_fields: keywords of published papers
- fields of study of the published paper
NOTE: in the long run, we may consider moving this to prep_affiliations.py, or unifying the two scripts in a new file.
"""

# TODO
# add some stats on the "concentration" per author-year?
# most likely much more concentrated than fields, so not doing for now

import argparse
import sqlite3 as sqlite
import warnings
import time
from helpers.functions import analyze_db
from helpers.variables import db_file, insert_questionmark_doctypes, keep_doctypes
from helpers.variables import db_file


# ## Arguments
parser = argparse.ArgumentParser()
parser.add_argument("--fos_max_level", type=int, default=2,
help="Fields of study up to which level to include?")
args = parser.parse_args()

# ## Variables; connect to db
start_time = time.time()
print(f"Start time: {start_time} \n")
interactive = False # Turn this on to query only a few records, i.e., for testing


con = sqlite.connect(database = db_file, isolation_level= None)

print("Making affiliation_outcomes table ...\n")
print("Creating temp table paper_affiliation_year")

query_limit = ""
if interactive:
query_limit = "LIMIT 1000"


con.execute("DROP TABLE IF EXISTS affiliation_outcomes")
con.execute(f"""CREATE TABLE affiliation_outcomes AS
SELECT AffiliationId, COUNT(DISTINCT PaperId) AS PublicationCount
from PaperAuthorAffiliations
with con as c:
c.execute(f"""
CREATE TEMP TABLE paper_affiliation_year AS
SELECT DISTINCT AffiliationId, Year, PaperId
FROM (
SELECT a.AuthorId, a.AffiliationId, a.Year, b.Paperid
FROM AuthorAffiliation a -- ## if an author has 2 main affiliations in the same year, we count their papers at both institutions
INNER JOIN (
SELECT PaperId, AuthorId, Year
FROM PaperAuthorUnique
INNER JOIN (
SELECT PaperId, Year
FROM Papers
) USING(PaperId)
{query_limit}
) b
ON a.AuthorId=b.AuthorId AND a.Year=b.Year
-- reduces size of the data set
INNER JOIN (
SELECT PaperId
FROM Papers
WHERE DocType IN ({insert_questionmark_doctypes})
AND Year >= 1950
FROM paper_outcomes
) USING(PaperId)
)
""")

c.execute("CREATE INDEX idx_paper_temp ON paper_affiliation_year (PaperId)")


print("Creating table affiliation_outcomes")

with con as c:
c.execute("DROP TABLE IF EXISTS affiliation_outcomes")

c.execute("""
CREATE TABLE affiliation_outcomes AS
SELECT AffiliationId
, Year
, Field0
, COUNT(PaperId) AS PaperCount
, SUM(CitationCount_y10) AS CitationCount_y10
FROM paper_affiliation_year
INNER JOIN (
SELECT PaperId, CitationCount_y10
FROM paper_outcomes
) USING(PaperId)
INNER JOIN (
SELECT PaperId, Field0
FROM PaperMainFieldsOfStudy
)
USING(PaperId)
GROUP BY AffiliationId, Year, Field0
""")

c.execute("CREATE UNIQUE INDEX idx_affo_AffilYearField ON affiliation_outcomes (AffiliationId, Year, Field0)")

print("Creating table affiliation_fields ")

with con as c:
c.execute("DROP TABLE IF EXISTS affiliation_fields")

c.execute(f"""
CREATE TABLE affiliation_fields AS
SELECT AffiliationId
, Field0
, Year
, FieldOfStudyId
, SUM(Score) AS Score
FROM paper_affiliation_year
INNER JOIN (
SELECT PaperId, FieldOfStudyId, Score
FROM PaperFieldsOfStudy
INNER JOIN (
SELECT AffiliationId
FROM Affiliations
) USING (AffiliationId)
GROUP BY (AffiliationId)
""",
(keep_doctypes)
)
con.execute("CREATE UNIQUE INDEX idx_affo_AffiliationId ON affiliation_outcomes (AffiliationId ASC)")
SELECT FieldOfStudyId
FROM FieldsOfStudy
WHERE level <= {args.fos_max_level}
) USING(FieldOfStudyId)
) USING(PaperId)
INNER JOIN (
SELECT PaperId, Field0
FROM PaperMainFieldsOfStudy
) USING(PaperId)
GROUP BY AffiliationId, FieldOfStudyId, Year, Field0
""")

c.execute("CREATE UNIQUE INDEX idx_afff_AffilFieldYearField ON affiliation_fields (AffiliationId, FieldOfStudyId, Year, Field0)")
c.execute("CREATE INDEX idx_afff_Year ON affiliation_fields (Year)")
c.execute("CREATE INDEX idx_afff_FoS ON affiliation_fields (FieldOfStudyId)")



# ## Run ANALYZE, finish
analyze_db(con)
with con as c:
analyze_db(c)


con.close()

Expand Down
6 changes: 4 additions & 2 deletions src/dataprep/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ python3 -m $script_path.prep_mag.read_collab &> $logfile_path/read_collab.log

python3 $script_path/prep_mag/prep_affiliations.py &> $logfile_path/prep_affiliations.log

python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end

python3 $script_path/prep_mag/prep_citations.py &> $logfile_path/prep_citations.log

python3 $script_path/prep_mag/paper_outcomes.py &> $logfile_path/paper_outcomes.log
Expand All @@ -50,6 +48,10 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \
python -m $script_path.prep_mag.author_field0 \
&> $logfile_path/author_field0.log

python3 -m $script_path.prep_mag.affiliation_outcomes --fos_max_level 2 \
&> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end


# ## Consolidate gender per author in author_sample
python3 $script_path/prep_mag/author_gender.py &> $logfile_path/author_gender.log

Expand Down
9 changes: 5 additions & 4 deletions src/dataprep/temp/affiliation_outcomes.log
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
Start time: 1655815997.1026988

Making affiliation_outcomes table ...
Start time: 1670939995.9846091

Creating temp table paper_affiliation_year
Creating table affiliation_outcomes
Creating table affiliation_fields
Running ANALYZE...

Done in 81.3799677491188 minutes.
Done in 89.05228799978892 minutes.