Skip to content

Commit

Permalink
Merge pull request #29 from f-hafner/flavio/issue-27
Browse files Browse the repository at this point in the history
Flavio/issue 27
  • Loading branch information
chrished authored Dec 15, 2022
2 parents b255b76 + 78bc95d commit 0adb2a9
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 32 deletions.
133 changes: 107 additions & 26 deletions src/dataprep/main/prep_mag/affiliation_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,57 +4,138 @@
"""
Script affiliation_outcomes.py
Generate tables:
- affiliation_outcomes: some features at the affiliation level.
- number of journal articles published per affiliation. In contrast to table Affiliations,
consider only keep_doctypes
Generate tables at the affiliation-year-field0 level:
- affiliation_outcomes: publication outcomes
- number of journal articles published and 10-year forward citations per
affiliation-year-field0. Field0 is assigned from the Field0 of the published paper
- affiliation_fields: keywords of published papers
- fields of study of the published paper
NOTE: in the long run, we may consider moving this to prep_affiliations.py, or unifying them in a new file.
"""

# TODO
# add some stats on the "concentration" per author-year?
# most likely much more concentrated than fields, so not doing for now

import argparse
import sqlite3 as sqlite
import warnings
import time
from helpers.functions import analyze_db
from helpers.variables import db_file, insert_questionmark_doctypes, keep_doctypes
from helpers.variables import db_file


# ## Arguments
# Command-line interface of the script: a single optional integer flag.
parser = argparse.ArgumentParser()
# --fos_max_level (default 2) is used further down as the upper bound on
# FieldsOfStudy.level when building the affiliation_fields table.
parser.add_argument("--fos_max_level", type=int, default=2,
help="Fields of study up to which level to include?")
# Parsed when the script starts; type=int makes argparse reject non-integer values.
args = parser.parse_args()

# ## Variables; connect to db
start_time = time.time()
print(f"Start time: {start_time} \n")
interactive = False # Turn this on for only querying few records, ie for testing


con = sqlite.connect(database = db_file, isolation_level= None)

print("Making affiliation_outcomes table ...\n")
print("Creating temp table paper_affiliation_year")

# Optional row cap for test runs: stays empty unless `interactive` is on, in
# which case it is interpolated as a LIMIT clause into the temp-table query.
query_limit = ""
if interactive:
query_limit = "LIMIT 1000"


con.execute("DROP TABLE IF EXISTS affiliation_outcomes")
con.execute(f"""CREATE TABLE affiliation_outcomes AS
SELECT AffiliationId, COUNT(DISTINCT PaperId) AS PublicationCount
from PaperAuthorAffiliations
with con as c:
c.execute(f"""
CREATE TEMP TABLE paper_affiliation_year AS
SELECT DISTINCT AffiliationId, Year, PaperId
FROM (
SELECT a.AuthorId, a.AffiliationId, a.Year, b.Paperid
FROM AuthorAffiliation a -- ## if an author has 2 main affiliations in the same year, we count their papers at both institutions
INNER JOIN (
SELECT PaperId, AuthorId, Year
FROM PaperAuthorUnique
INNER JOIN (
SELECT PaperId, Year
FROM Papers
) USING(PaperId)
{query_limit}
) b
ON a.AuthorId=b.AuthorId AND a.Year=b.Year
-- reduces size of the data set
INNER JOIN (
SELECT PaperId
FROM Papers
WHERE DocType IN ({insert_questionmark_doctypes})
AND Year >= 1950
FROM paper_outcomes
) USING(PaperId)
)
""")

c.execute("CREATE INDEX idx_paper_temp ON paper_affiliation_year (PaperId)")


print("Creating table affiliation_outcomes")

# Rebuild affiliation_outcomes from scratch: one row per
# (AffiliationId, Year, Field0) holding the number of papers and the sum of
# their CitationCount_y10 values.
with con as c:
# Drop any previous version so the CREATE TABLE below cannot collide with it.
c.execute("DROP TABLE IF EXISTS affiliation_outcomes")

# The inner joins keep only papers that appear in both paper_outcomes and
# PaperMainFieldsOfStudy; Field0 is taken from the paper's row in the latter.
c.execute("""
CREATE TABLE affiliation_outcomes AS
SELECT AffiliationId
, Year
, Field0
, COUNT(PaperId) AS PaperCount
, SUM(CitationCount_y10) AS CitationCount_y10
FROM paper_affiliation_year
INNER JOIN (
SELECT PaperId, CitationCount_y10
FROM paper_outcomes
) USING(PaperId)
INNER JOIN (
SELECT PaperId, Field0
FROM PaperMainFieldsOfStudy
)
USING(PaperId)
GROUP BY AffiliationId, Year, Field0
""")

# The unique index is declared on exactly the GROUP BY columns of the query above.
c.execute("CREATE UNIQUE INDEX idx_affo_AffilYearField ON affiliation_outcomes (AffiliationId, Year, Field0)")

print("Creating table affiliation_fields ")

with con as c:
c.execute("DROP TABLE IF EXISTS affiliation_fields")

c.execute(f"""
CREATE TABLE affiliation_fields AS
SELECT AffiliationId
, Field0
, Year
, FieldOfStudyId
, SUM(Score) AS Score
FROM paper_affiliation_year
INNER JOIN (
SELECT PaperId, FieldOfStudyId, Score
FROM PaperFieldsOfStudy
INNER JOIN (
SELECT AffiliationId
FROM Affiliations
) USING (AffiliationId)
GROUP BY (AffiliationId)
""",
(keep_doctypes)
)
con.execute("CREATE UNIQUE INDEX idx_affo_AffiliationId ON affiliation_outcomes (AffiliationId ASC)")
SELECT FieldOfStudyId
FROM FieldsOfStudy
WHERE level <= {args.fos_max_level}
) USING(FieldOfStudyId)
) USING(PaperId)
INNER JOIN (
SELECT PaperId, Field0
FROM PaperMainFieldsOfStudy
) USING(PaperId)
GROUP BY AffiliationId, FieldOfStudyId, Year, Field0
""")

c.execute("CREATE UNIQUE INDEX idx_afff_AffilFieldYearField ON affiliation_fields (AffiliationId, FieldOfStudyId, Year, Field0)")
c.execute("CREATE INDEX idx_afff_Year ON affiliation_fields (Year)")
c.execute("CREATE INDEX idx_afff_FoS ON affiliation_fields (FieldOfStudyId)")



# ## Run ANALYZE, finish
analyze_db(con)
with con as c:
analyze_db(c)


con.close()

Expand Down
6 changes: 4 additions & 2 deletions src/dataprep/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ python3 -m $script_path.prep_mag.read_collab &> $logfile_path/read_collab.log

python3 $script_path/prep_mag/prep_affiliations.py &> $logfile_path/prep_affiliations.log

python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end

python3 $script_path/prep_mag/prep_citations.py &> $logfile_path/prep_citations.log

python3 $script_path/prep_mag/paper_outcomes.py &> $logfile_path/paper_outcomes.log
Expand All @@ -50,6 +48,10 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \
python -m $script_path.prep_mag.author_field0 \
&> $logfile_path/author_field0.log

python3 -m $script_path.prep_mag.affiliation_outcomes --fos_max_level 2 \
&> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end


# ## Consolidate gender per author in author_sample
python3 $script_path/prep_mag/author_gender.py &> $logfile_path/author_gender.log

Expand Down
9 changes: 5 additions & 4 deletions src/dataprep/temp/affiliation_outcomes.log
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
Start time: 1655815997.1026988

Making affiliation_outcomes table ...
Start time: 1670939995.9846091

Creating temp table paper_affiliation_year
Creating table affiliation_outcomes
Creating table affiliation_fields
Running ANALYZE...

Done in 81.3799677491188 minutes.
Done in 89.05228799978892 minutes.

0 comments on commit 0adb2a9

Please sign in to comment.