Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix arxiv adjuster #68

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 85 additions & 69 deletions background/preformatters/arxiv.js
Original file line number Diff line number Diff line change
@@ -1,73 +1,89 @@
var BINPreformatter = ( function () {
// Shadow declarations: each name below is redeclared as a local `null`, so code
// inside this closure cannot reach an outer binding of the same name. The
// project describes this as a "promise not to touch global data and variables"
// and requires it to be present for the preformatter to be accepted.
var BINData = null;
var BINInteraction = null;
var BINParser = null;
var window = null;
var document = null;

// NOTE(review): the same five shadow declarations appear a second time here.
// Redeclaring a `var` in the same function scope is legal and has no further
// effect; presumably the repetition comes from the old and new side of the
// diff being shown together -- confirm before removing either copy.
var BINData = null;
var BINInteraction = null;
var BINParser = null;
var window = null;
var document = null;

//preformatting function
/**
 * Preformats the metadata collected for an arXiv entry, modifying it in place.
 *
 * Steps performed:
 *  - strips a leading "Abstract:" label (followed by at least one space) from
 *    `citation_abstract`;
 *  - rewrites math occurrences in the abstract as `$symbol$`, driven by the
 *    list stored in `citation_misc`;
 *  - clears `citation_misc`;
 *  - sets the journal title, journal abbreviation and database fields;
 *  - rewrites the first "/pdf/" in `citation_url` to "/abs/".
 *
 * `citation_misc` is expected to be a string of entries separated by " ; "
 * that come in (prefix, symbol) pairs. For every pair, the text
 * `prefix + symbol` is searched in the abstract and replaced by `$symbol$`.
 * (Presumably prefix is the rendered math text and symbol its TeX source --
 * TODO confirm against the extractor.)
 *
 * @param {Object} metaData - metadata record; `citation_abstract` and
 *     `citation_url` must be strings, `citation_misc` may be missing or empty.
 * @param {*} parser - unused by this function.
 * @returns {undefined} nothing; `metaData` is updated in place.
 */
function preformatData(metaData, parser) {

    //fix beginning of abstract
    let abstract = metaData["citation_abstract"].replace(/^Abstract:[\ ]+/,"");

    //fix math in abstract, math symbols saved in citation_misc
    const misc = metaData["citation_misc"];

    //a missing or non-string misc field simply means there is no math to fix
    if (typeof misc === "string" && misc !== "") {
        const mathSymbols = misc.split(/[\ ]+;[\ ]+/);
        const length = mathSymbols.length;

        //entries must come in (prefix, symbol) pairs, otherwise leave the text alone
        if (length % 2 === 0) {

            //position in the abstract from which the next pair is searched
            let idx = 0;
            for (let i = 0; i < length; i += 2) {

                //continue only if the math symbol is not an empty string
                const symbol = mathSymbols[i + 1].trim();
                if (symbol === "") continue;

                //search for prefix + symbol in the abstract text
                const match = mathSymbols[i].trim() + symbol;
                const nextIdx = abstract.indexOf(match, idx);
                if (nextIdx === -1) continue;

                //replace the match by the math symbol wrapped in dollar signs
                const replacement = "$" + symbol + "$";
                abstract = abstract.slice(0, nextIdx) + replacement + abstract.slice(nextIdx + match.length);

                //resume right behind the inserted text; starting any later would
                //skip a math expression that directly follows this one
                idx = nextIdx + replacement.length;
            }
        }
    }

    //reassign abstract
    metaData["citation_abstract"] = abstract;

    //clear misc
    metaData["citation_misc"] = "";

    //manually set journal
    metaData["citation_journal_title"] = "ArXiv e-prints";
    metaData["citation_journal_abbrev"] = "arXiv";
    metaData["citation_database"] = "ArXiv e-prints";

    //preformat url
    metaData["citation_url"] = metaData["citation_url"].replace("/pdf/","/abs/");
}

// expose preformatting function and raw preformatting function
return { preformatData : preformatData };
//preformatting function
/**
 * Preformats the metadata collected for an arXiv entry, modifying it in place.
 *
 * Steps performed:
 *  - strips a leading "Abstract:" label (and any spaces after it) from
 *    `citation_abstract` and a leading "Title:" label from `citation_title`;
 *  - rewrites math occurrences in title and abstract as `$symbol$`, driven by
 *    the list stored in `citation_misc`;
 *  - clears `citation_misc`;
 *  - sets the journal title, journal abbreviation and database fields;
 *  - rewrites the first "/pdf/" in `citation_url` to "/abs/".
 *
 * @param {Object} metaData - metadata record; `citation_title`,
 *     `citation_abstract` and `citation_url` must be strings, `citation_misc`
 *     may be missing or empty.
 * @param {*} parser - unused by this function.
 * @returns {undefined} nothing; `metaData` is updated in place.
 */
function preformatData(metaData, parser) {

    /**
     * Replaces math occurrences in `content` by `$symbol$`.
     *
     * `sym` is a string of entries separated by " ; " that come in
     * (prefix, symbol) pairs. For every pair the text `prefix + symbol` is
     * searched from the end of the previous replacement onwards and replaced
     * by `$symbol$`. Pairs that are not found or have an empty symbol are
     * skipped. (Presumably prefix is the rendered math text and symbol its
     * TeX source -- TODO confirm against the extractor.)
     *
     * @param {string} content - text to process.
     * @param {*} sym - pair list; anything but a non-empty string with an
     *     even number of entries leaves `content` unchanged.
     * @returns {string} the processed text.
     */
    function fixMath(content, sym) {

        //a missing or non-string symbol list simply means there is no math to fix
        if (typeof sym !== "string" || sym === "") return content;

        const mathSymbols = sym.split(/[\ ]+;[\ ]+/);
        const length = mathSymbols.length;

        //entries must come in (prefix, symbol) pairs, otherwise leave the text alone
        if (length % 2 !== 0) return content;

        //position in the text from which the next pair is searched
        let idx = 0;
        for (let i = 0; i < length; i += 2) {

            //continue only if the math symbol is not an empty string
            const symbol = mathSymbols[i + 1].trim();
            if (symbol === "") continue;

            //search for prefix + symbol in the text
            const match = mathSymbols[i].trim() + symbol;
            const nextIdx = content.indexOf(match, idx);
            if (nextIdx === -1) continue;

            //replace the match by the math symbol wrapped in dollar signs
            const replacement = "$" + symbol + "$";
            content = content.slice(0, nextIdx) + replacement + content.slice(nextIdx + match.length);

            //resume right behind the inserted text; starting any later would
            //skip a math expression that directly follows this one
            idx = nextIdx + replacement.length;
        }
        return content;
    }

    //fix beginning of title and abstract
    metaData["citation_abstract"] = metaData["citation_abstract"].replace(/^Abstract:[\ ]*/,"");
    metaData["citation_title"] = metaData["citation_title"].replace(/^Title:/,"");

    //title and abstract are processed as ONE string so that the pairs in
    //citation_misc are consumed in order across both fields. The separator is
    //repeated until it is longer than title and abstract together, so it
    //cannot occur inside either of them.
    const separator = "A42h4".repeat(metaData["citation_abstract"].length + metaData["citation_title"].length);

    //fix math in title and abstract, math symbols saved in citation_misc
    const proc = fixMath(metaData["citation_title"] + separator + metaData["citation_abstract"], metaData["citation_misc"]);

    //reassign title and abstract; if the text does not split into exactly two
    //parts (e.g. both fields empty) the label-stripped values are kept as they are
    const parts = proc.split(separator);
    if (parts.length === 2) {
        metaData["citation_title"] = parts[0];
        metaData["citation_abstract"] = parts[1];
    }

    //clear misc
    metaData["citation_misc"] = "";

    //manually set journal
    metaData["citation_journal_title"] = "ArXiv e-prints";
    metaData["citation_journal_abbrev"] = "arXiv";
    metaData["citation_database"] = "ArXiv e-prints";

    //preformat url
    metaData["citation_url"] = metaData["citation_url"].replace("/pdf/","/abs/");
}

// expose preformatting function and raw preformatting function
return { preformatData : preformatData };

}());
11 changes: 9 additions & 2 deletions extractors/prefselectors/arxiv.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,16 @@ var BINPrefselector = ( function () {

// these are the preferred selectors used, and may be modified. The format is "bibfield: [ [css-selector,attribute], ...],", where "attribute" can be any html tag attribute or "innerText" to get the text between <tag> and </tag>
// Preferred selectors: maps a bibliography field name to a list of
// [css-selector, attribute, ...] candidates.
// NOTE(review): some entries carry extra positional elements after the
// attribute (e.g. `true, 20000` or `true, 1024, true, 1000`); their meaning is
// defined by the consuming code, which is not visible here -- confirm there.
var prefselectorMsg = {
// title: text content of the page's h1.title element
citation_title: [ [ 'h1.title','textContent', true, 20000] ],
// title taken from the og:title meta tag
citation_title_nonlatex: [ [ 'meta[property="og:title"]','content'] ],
// abstract: text content of blockquote.abstract
citation_abstract: [ [ 'blockquote.abstract','textContent', true, 20000] ],
// math elements found inside the abstract only
citation_misc: [ ['blockquote.abstract script[type="math/tex"], blockquote.abstract span.MathJax','textContent',true, 1024, true, 1000] ], /*for mathjax detection*/
citation_keywords: [ [ 'td.tablecell.subjects','innerText'] ],

/*for mathjax detection*/
// NOTE(review): `citation_misc` is defined a second time here, now also
// covering math elements inside h1.title. With duplicate keys in an object
// literal the later definition is the one that takes effect. Presumably the
// duplication comes from the old and new side of the diff being shown
// together -- confirm.
citation_misc: [ ['h1.title script[type="math/tex"], h1.title span.MathJax, blockquote.abstract script[type="math/tex"], blockquote.abstract span.MathJax','textContent',true, 1024, true, 1000] ],



// NOTE(review): second, identical definition of `citation_keywords` (see the
// remark on duplicate keys for `citation_misc`).
citation_keywords: [ [ 'td.tablecell.subjects','innerText'] ],
// canonical url from the og:url meta tag (attribute value matched case-insensitively)
citation_url: [ ['meta[property="og:url" i]','content'] ],
// DOI: meta tag first, then the links in the td.doi / td.arxivdoi cells
citation_doi: [ ['meta[name="citation_doi"]','content'] , ['td.doi a','href'] , ['td.arxivdoi a','href'] ]
};
Expand Down