From 3ff8b39df71147c662ea51e7eafbae855278d71d Mon Sep 17 00:00:00 2001 From: Przemek Wiech Date: Mon, 7 Feb 2022 23:36:47 +0100 Subject: [PATCH] Updated Werelate scraper to use HTTPS --- README.md | 2 +- src/scrapers/werelate.js | 94 +++++++++++------------ test/data/werelate/output/mary-ball.json | 6 +- test/data/werelate/output/washington.json | 6 +- test/src/scrapers/werelate.js | 12 +-- 5 files changed, 60 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index e4b6dfe..29da73f 100644 --- a/README.md +++ b/README.md @@ -122,5 +122,5 @@ from the same record on the same website. * [Genealogie Online](https://www.genealogieonline.nl) * [MyHeritage](https://www.myheritage.com) historical records and tree profiles * [Open Archives](https://www.openarch.nl) -* [WeRelate](http://www.werelate.org/) +* [WeRelate](https://www.werelate.org/) * [WikiTree](http://www.wikitree.com/) person profiles diff --git a/src/scrapers/werelate.js b/src/scrapers/werelate.js index ba9cf92..16663d8 100644 --- a/src/scrapers/werelate.js +++ b/src/scrapers/werelate.js @@ -3,7 +3,7 @@ var debug = require('debug')('genscrape:scrapers:werela'), GedcomX = require('gedcomx-js'); var urls = [ - utils.urlPatternToRegex("http://www.werelate.org/wiki/Person:*") + utils.urlPatternToRegex("https://www.werelate.org/wiki/Person:*") ]; module.exports = function(register){ @@ -86,9 +86,9 @@ var factTypes = { }; function run(emitter){ - + debug('run'); - + var gedx = GedcomX(), primaryPerson = GedcomX.Person({ principal: true, @@ -97,13 +97,13 @@ function run(emitter){ 'genscrape': getRecordIdentifier(document.location.href) } }); - + gedx.addPerson(primaryPerson); - + // // Facts // - + // Gather the fact data var facts = []; Array.from(document.querySelectorAll('.wr-infotable-factsevents tr')).forEach(function(row){ @@ -115,21 +115,21 @@ function run(emitter){ }); } }); - + // Process the fact data facts.forEach(function(factInfo){ - + var row = factInfo.row, label = factInfo.label, dateCell = row.children[1], placeCell = row.children[2]; - + switch(label){ - + case 'Name': primaryPerson.addSimpleName(dateCell.textContent.trim()); break; - + case 'Gender': switch(dateCell.textContent.trim()){ case 'Male': @@ -144,7 +144,7 @@ function run(emitter){ break; } break; - + // Most facts will have a date and a place default: if(row.children.length === 3){ @@ -173,22 +173,22 @@ function run(emitter){ } } } - + }); - + // // Relationships // - + // Parents and siblings Array.from(document.querySelectorAll('.wr-infobox-parentssiblings')).forEach(function(family){ - + var parents = family.querySelector('ul'), children = family.querySelector('ol'), marriage = family.querySelector('.wr-infobox-event'), parent1Label = parentLabel(parents.children[0]), mother, father; - + if(parent1Label === 'F'){ father = processPerson(parents.children[0], 'http://gedcomx.org/Male'); mother = processPerson(parents.children[1], 'http://gedcomx.org/Female'); @@ -196,7 +196,7 @@ function run(emitter){ father = processPerson(parents.children[1], 'http://gedcomx.org/Male'); mother = processPerson(parents.children[0], 'http://gedcomx.org/Female'); } - + if(father){ gedx.addPerson(father); gedx.addRelationship({ @@ -205,7 +205,7 @@ function run(emitter){ person2: primaryPerson }); } - + if(mother){ gedx.addPerson(mother); gedx.addRelationship({ @@ -214,7 +214,7 @@ function run(emitter){ person2: primaryPerson }); } - + if(father && mother){ var couple = GedcomX.Relationship({ type: 'http://gedcomx.org/Couple', @@ -231,17 +231,17 @@ function run(emitter){ } gedx.addRelationship(couple); } - + Array.from(children.children).forEach(function(child){ - + // Skip the entry for the primary person if(child.querySelector('.selflink')){ return; } - + child = processPerson(child); gedx.addPerson(child); - + if(father){ gedx.addRelationship({ type: 'http://gedcomx.org/ParentChild', @@ -249,7 +249,7 @@ function run(emitter){ person2: child }); } - + if(mother){ gedx.addRelationship({ type: 'http://gedcomx.org/ParentChild', @@ -259,17 +259,17 @@ function run(emitter){ } }); }); - + // Spouses and children Array.from(document.querySelectorAll('.wr-infobox-spousechildren')).forEach(function(family){ - + var parents = family.querySelector('ul'), children = family.querySelector('ol'), marriage = family.querySelector('.wr-infobox-event'), parent1Label = parentLabel(parents.children[0]), parent2Label = parentLabel(parents.children[1]), spouse, spouseLabel, couple; - + // Determine whether the primary person is the husband or the wife if(parents.children[0].querySelector('.selflink')){ spouse = processPerson(parents.children[1]); @@ -278,19 +278,19 @@ function run(emitter){ spouse = processPerson(parents.children[0]); spouseLabel = parent1Label; } - + spouse.setGender({ type: spouseLabel === 'H' ? 'http://gedcomx.org/Male' : 'http://gedcomx.org/Female' }); - + gedx.addPerson(spouse); - + couple = GedcomX.Relationship({ type: 'http://gedcomx.org/Couple', person1: primaryPerson, person2: spouse }); - + if(marriage){ couple.addFact({ type: 'http://gedcomx.org/Marriage', @@ -299,21 +299,21 @@ function run(emitter){ } }); } - + gedx.addRelationship(couple); - + if(children){ Array.from(children.children).forEach(function(child){ - + child = processPerson(child); gedx.addPerson(child); - + gedx.addRelationship({ type: 'http://gedcomx.org/ParentChild', person1: primaryPerson, person2: child }); - + gedx.addRelationship({ type: 'http://gedcomx.org/ParentChild', person1: spouse, @@ -322,9 +322,9 @@ function run(emitter){ }); } }); - + // TODO: gather sources listed in the profile - + // Agent gedx.addAgent(GedcomX.Agent() .setId('agent') @@ -333,9 +333,9 @@ function run(emitter){ value: 'WeRelate' }) .setHomepage({ - resource: 'http://www.werelate.org' + resource: 'https://www.werelate.org' })); - + // SourceDescription gedx.addSourceDescriptionToAll({ about: document.location.href, @@ -346,7 +346,7 @@ function run(emitter){ ], citations: [ { - value: document.title + ', WeRelate.org (' + window.document.location.href + value: document.title + ', WeRelate.org (' + window.document.location.href + ' : accessed ' + utils.getDateString() + ')' } ], @@ -354,14 +354,14 @@ function run(emitter){ resource: '#agent' } }); - + emitter.emit('data', gedx); } /** * Get the label for a parent in a family box. For parents it is F (Father) or * M (Mother). For spouses it's H (Husband) or W (Wife). - * + * * @param {Element} parent Parent's li element * @return {String} label */ @@ -373,7 +373,7 @@ function parentLabel(parent){ /** * Create a GedcomX.Person from a person entry in a family box - * + * * @param {Element} element Person's li DOM element * @param {String} gender * @return {GedcomX.Person} @@ -417,7 +417,7 @@ function processPerson(element, gender){ /** * Get the record ID - * + * * @param {String} url * @return {String} */ @@ -427,7 +427,7 @@ function getRecordId(url) { /** * Get a record's identifier - * + * * @param {String} url * @return {String} */ diff --git a/test/data/werelate/output/mary-ball.json b/test/data/werelate/output/mary-ball.json index 5d636fa..5589cf2 100644 --- a/test/data/werelate/output/mary-ball.json +++ b/test/data/werelate/output/mary-ball.json @@ -730,10 +730,10 @@ "id": "1", "citations": [ { - "value": "Person:Mary Ball (5) - Genealogy, WeRelate.org (http://www.werelate.org/wiki/Person:Mary_Ball_%285%29 : accessed 17 April 2013)" + "value": "Person:Mary Ball (5) - Genealogy, WeRelate.org (https://www.werelate.org/wiki/Person:Mary_Ball_%285%29 : accessed 17 April 2013)" } ], - "about": "http://www.werelate.org/wiki/Person:Mary_Ball_%285%29", + "about": "https://www.werelate.org/wiki/Person:Mary_Ball_%285%29", "titles": [ { "value": "Person:Mary Ball (5) - Genealogy" @@ -754,7 +754,7 @@ } ], "homepage": { - "resource": "http://www.werelate.org" + "resource": "https://www.werelate.org" } } ] diff --git a/test/data/werelate/output/washington.json b/test/data/werelate/output/washington.json index 6b7e0a1..7c92500 100644 --- a/test/data/werelate/output/washington.json +++ b/test/data/werelate/output/washington.json @@ -713,10 +713,10 @@ "id": "1", "citations": [ { - "value": "Person:George Washington (6) - Genealogy, WeRelate.org (http://www.werelate.org/wiki/Person:George_Washington_%286%29 : accessed 17 April 2013)" + "value": "Person:George Washington (6) - Genealogy, WeRelate.org (https://www.werelate.org/wiki/Person:George_Washington_%286%29 : accessed 17 April 2013)" } ], - "about": "http://www.werelate.org/wiki/Person:George_Washington_%286%29", + "about": "https://www.werelate.org/wiki/Person:George_Washington_%286%29", "titles": [ { "value": "Person:George Washington (6) - Genealogy" @@ -737,7 +737,7 @@ } ], "homepage": { - "resource": "http://www.werelate.org" + "resource": "https://www.werelate.org" } } ] diff --git a/test/src/scrapers/werelate.js b/test/src/scrapers/werelate.js index 8f96ebe..d565265 100644 --- a/test/src/scrapers/werelate.js +++ b/test/src/scrapers/werelate.js @@ -1,15 +1,15 @@ var setupTest = require('../../testHelpers').createTestRunner('werelate'); - + describe('werelate', function(){ - + it('male', setupTest( 'washington', - 'http://www.werelate.org/wiki/Person:George_Washington_%286%29' + 'https://www.werelate.org/wiki/Person:George_Washington_%286%29' )); - + it('female and children', setupTest( 'mary-ball', - 'http://www.werelate.org/wiki/Person:Mary_Ball_%285%29' + 'https://www.werelate.org/wiki/Person:Mary_Ball_%285%29' )); - + }); \ No newline at end of file