From 05d869b0809b1694241835707a9705eeca96c1ea Mon Sep 17 00:00:00 2001 From: Aga Date: Mon, 4 Dec 2023 11:43:54 +0000 Subject: [PATCH 1/3] update the script --- .../scripts/231127_withdrawn_journals.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/portality/scripts/231127_withdrawn_journals.py b/portality/scripts/231127_withdrawn_journals.py index 62f73bb678..99c6a2f251 100644 --- a/portality/scripts/231127_withdrawn_journals.py +++ b/portality/scripts/231127_withdrawn_journals.py @@ -12,6 +12,16 @@ } } +IN_DOAJ = { + "query" : { + "bool" : { + "must" : [ + {"term" : {"admin.in_doaj" : True}} + ] + } + } +} + if __name__ == "__main__": import argparse @@ -19,12 +29,12 @@ parser.add_argument("-o", "--out", help="output file path") args = parser.parse_args() - if not args.out: - print("Please specify an output file path with the -o option") - parser.print_help() - exit() + # if not args.out: + # print("Please specify an output file path with the -o option") + # parser.print_help() + # exit() - with open(args.out, "w", encoding="utf-8") as f: + with open("out.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["ID", "Journal Name", @@ -33,15 +43,23 @@ "P-ISSN" ]) - for journal in Journal.iterate(q=NOT_IN_DOAJ, keepalive='5m', wrap=True): + in_doaj_issns = [] + for journal in Journal.iterate(q=IN_DOAJ, keepalive='5m', wrap=True): bibjson = journal.bibjson() + in_doaj_issns.append({bibjson.get_one_identifier(bibjson.E_ISSN), bibjson.get_one_identifier(bibjson.P_ISSN)}) - writer.writerow([journal.id, - bibjson.title, - bibjson.get_single_url(urltype="homepage"), - bibjson.get_one_identifier(bibjson.E_ISSN), - bibjson.get_one_identifier(bibjson.P_ISSN), - ]) + + for journal in Journal.iterate(q=NOT_IN_DOAJ, keepalive='5m', wrap=True): + bibjson = journal.bibjson() + eissn = bibjson.get_one_identifier(bibjson.E_ISSN) + pissn = bibjson.get_one_identifier(bibjson.P_ISSN) + if ({eissn, pissn} not in 
in_doaj_issns): + writer.writerow([journal.id, + bibjson.title, + bibjson.get_single_url(urltype="homepage"), + eissn, + pissn, + ]) From e124bc7afa7e43cbadd4a30137eee60b6b41399b Mon Sep 17 00:00:00 2001 From: Aga Date: Mon, 4 Dec 2023 11:47:14 +0000 Subject: [PATCH 2/3] revert dev changes: restore the required -o output path --- portality/scripts/231127_withdrawn_journals.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/portality/scripts/231127_withdrawn_journals.py b/portality/scripts/231127_withdrawn_journals.py index 99c6a2f251..c9445bc46e 100644 --- a/portality/scripts/231127_withdrawn_journals.py +++ b/portality/scripts/231127_withdrawn_journals.py @@ -29,12 +29,12 @@ parser.add_argument("-o", "--out", help="output file path") args = parser.parse_args() - # if not args.out: - # print("Please specify an output file path with the -o option") - # parser.print_help() - # exit() + if not args.out: + print("Please specify an output file path with the -o option") + parser.print_help() + exit() - with open("out.csv", "w", encoding="utf-8") as f: + with open(args.out, "w", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["ID", "Journal Name", From 486369da278f657f8075188a668140a3094c214c Mon Sep 17 00:00:00 2001 From: Aga Date: Mon, 4 Dec 2023 12:25:23 +0000 Subject: [PATCH 3/3] update script to check whether ANY of the ISSNs is in DOAJ, rather than an exact pair match --- portality/scripts/231127_withdrawn_journals.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/portality/scripts/231127_withdrawn_journals.py b/portality/scripts/231127_withdrawn_journals.py index c9445bc46e..1eddf809de 100644 --- a/portality/scripts/231127_withdrawn_journals.py +++ b/portality/scripts/231127_withdrawn_journals.py @@ -43,17 +43,18 @@ "P-ISSN" ]) - in_doaj_issns = [] + in_doaj_issns = set() for journal in Journal.iterate(q=IN_DOAJ, keepalive='5m', wrap=True): bibjson = journal.bibjson() - in_doaj_issns.append({bibjson.get_one_identifier(bibjson.E_ISSN), 
bibjson.get_one_identifier(bibjson.P_ISSN)}) + in_doaj_issns.add(bibjson.get_one_identifier(bibjson.E_ISSN)) + in_doaj_issns.add(bibjson.get_one_identifier(bibjson.P_ISSN)) for journal in Journal.iterate(q=NOT_IN_DOAJ, keepalive='5m', wrap=True): bibjson = journal.bibjson() eissn = bibjson.get_one_identifier(bibjson.E_ISSN) pissn = bibjson.get_one_identifier(bibjson.P_ISSN) - if ({eissn, pissn} not in in_doaj_issns): + if (eissn not in in_doaj_issns and pissn not in in_doaj_issns): writer.writerow([journal.id, bibjson.title, bibjson.get_single_url(urltype="homepage"),