From 7b7e7eef9caa9d54e9e5c5a9cf79649be87c1536 Mon Sep 17 00:00:00 2001 From: Marielle Volz Date: Mon, 6 Feb 2017 10:37:52 +0000 Subject: [PATCH] Don't add tags with missing content Previously, metadata with tags that were missing content were being added, resulting in keys with the value 'undefined.' This skips adding tags with missing values. Adds static and live test. Addresses issue #54 --- lib/index.js | 74 ++++++++++++++------------ package.json | 10 +++- test/errors.js | 4 +- test/scraping.js | 16 ++++++ test/static/turtle_article_errors.html | 72 ++++++++++++++++++++++++- 5 files changed, 135 insertions(+), 41 deletions(-) diff --git a/lib/index.js b/lib/index.js index 370fdaf..310f5ba 100644 --- a/lib/index.js +++ b/lib/index.js @@ -65,8 +65,8 @@ exports.parseBase = BBPromise.method(function(chtml, tags, reason, getProperty, var property = getProperty(element); var content = getContent(element); - // If the element isn't what we're looking for, skip it - if (!property) { + // If lacks property or content, skip + if (!property || !content) { return; } @@ -102,14 +102,15 @@ exports.parseBEPress = BBPromise.method(function(chtml){ ['meta'], 'No BE Press metadata found in page', function(element) { - var nameAttr = element.attr('name'); + var content = element.attr('content'); + var name = element.attr('name'); - // If the element isn't a BE Press property, skip it - if (!nameAttr || (nameAttr.substring(0, 17).toLowerCase() !== 'bepress_citation_')) { + // If the element isn't a BE Press property or if content is missing, skip it + if (!name || !content || (name.substring(0, 17).toLowerCase() !== 'bepress_citation_')) { return; } - return nameAttr.substring(17).toLowerCase(); + return name.substring(17).toLowerCase(); }, function(element) { return element.attr('content'); @@ -212,11 +213,12 @@ exports.parseDublinCore = BBPromise.method(function(chtml){ ['meta', 'link'], 'No Dublin Core metadata found in page', function(element) { - var isLink = element[0].name === 'link', - nameAttr = element.attr(isLink ? 'rel' : 'name'); + var isLink = element[0].name === 'link'; + var nameAttr = element.attr(isLink ? 'rel' : 'name'); + var value = element.attr(isLink ? 'href' : 'content'); - // If the element isn't a Dublin Core property, skip it - if (!nameAttr + // If the element isn't a Dublin Core property or if value is missing, skip it + if (!nameAttr || !value || (nameAttr.substring(0, 3).toUpperCase() !== 'DC.' && nameAttr.substring(0, 8).toUpperCase() !== 'DCTERMS.')) { return; @@ -247,9 +249,10 @@ exports.parseEprints = BBPromise.method(function(chtml){ 'No EPrints metadata found in page', function(element) { var nameAttr = element.attr('name'); + var content = element.attr('content'); - // If the element isn't an EPrints property, skip it - if (!nameAttr || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') { + // If the element isn't an EPrints property or content is missing, skip it + if (!nameAttr || !content || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') { return; } @@ -295,7 +298,7 @@ exports.parseGeneral = BBPromise.method(function(chtml){ var value; Object.keys(clutteredMeta).forEach(function(key){ value = clutteredMeta[key]; - if (value){ + if (value){ // Only add if has value meta[key] = value; } }); @@ -322,9 +325,10 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){ 'No Highwire Press metadata found in page', function(element) { var nameAttr = element.attr('name'); + var content = element.attr('content'); // If the element isn't a Highwire Press property, skip it - if (!nameAttr || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) { + if (!nameAttr || !content || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) { return; } @@ -354,7 +358,11 @@ exports.parseJsonLd = BBPromise.method(function(chtml) { // Fail silently, just in case there are valid tags return; } - json.push(contents); + if (contents){ + json.push(contents); + } else { + return; + } }); if (json.length === 0) { @@ -370,10 +378,7 @@ exports.parseJsonLd = BBPromise.method(function(chtml) { * @return {Object} promise of open graph metadata object */ exports.parseOpenGraph = BBPromise.method(function(chtml){ - - var element; var itemType; - var propertyValue; var property; var node; var meta = {}; @@ -391,22 +396,21 @@ exports.parseOpenGraph = BBPromise.method(function(chtml){ if (!metaTags || metaTags.length === 0){ throw reason; } metaTags.each(function() { - element = chtml(this); - propertyValue = element.attr('property'); + var element = chtml(this); + var propertyValue = element.attr('property'); + var content = element.attr('content'); - if (!propertyValue){ + if (!propertyValue || !content){ return; } else { propertyValue = propertyValue.toLowerCase().split(':'); } - // If the element isn't in namespace, exit + // If the property isn't in namespace, exit if (namespace.indexOf(propertyValue[0]) < 0){ return; } - var content = element.attr('content'); - if (propertyValue.length === 2){ property = propertyValue[1]; // Set property to value after namespace if (property in subProperty){ // If has valid subproperty @@ -497,28 +501,28 @@ exports.parseTwitter = BBPromise.method(function(chtml) { metaTags.each(function() { var element = chtml(this); - var propertyValue = element.attr('name'); + var name = element.attr('name'); var property; var content = element.attr('content'); var node; - // Exit if not a twitter tag - if (!propertyValue){ + // Exit if not a twitter tag or content is missing + if (!name|| !content){ return; } else { - propertyValue = propertyValue.toLowerCase().split(':'); - property = propertyValue[1]; + name = name.toLowerCase().split(':'); + property = name[1]; } // Exit if tag not twitter metadata - if(propertyValue[0] !== 'twitter') { + if(name[0] !== 'twitter') { return; } // Handle nested properties - if(propertyValue.length > 2) { - var subProperty = propertyValue[2]; + if(name.length > 2) { + var subProperty = name[2]; // Upgrade the property to an object if it needs to be if(property in dualStateSubProperties && !(meta[property] instanceof Object)) { @@ -532,16 +536,16 @@ exports.parseTwitter = BBPromise.method(function(chtml) { // Differentiate betweeen twice and thrice nested properties // Not the prettiest solution, but twitter metadata guidelines are fairly strict so it's not nessesary // to anticipate strange data. - if(propertyValue.length === 3) { + if(name.length === 3) { node[subProperty] = content; - } else if (propertyValue.length === 4) { + } else if (name.length === 4) { // Solve the very specific twitter:player:stream:content_type case where stream needs to be upgraded to an object if(subProperty.toLowerCase() === "stream"){ node[subProperty] = {url: node[subProperty] }; }else { node[subProperty] = node[subProperty] ? node[subProperty] : {}; //Either create a new subnode or ammend the existing one } - node[subProperty][propertyValue[3]] = content; + node[subProperty][name[3]] = content; } else { // Something is malformed, so exit return; diff --git a/package.json b/package.json index 3b7384b..0bbadb3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "html-metadata", - "version": "1.6.2", + "version": "1.6.3", "description": "Scrapes metadata of several different standards", "main": "index.js", "dependencies": { @@ -20,10 +20,16 @@ "coverage": "istanbul cover _mocha -- -R spec" }, "keywords": [ + "bepress", + "coins", + "dublin core", + "eprints", + "highwire press", + "json-ld", "open graph", "metadata", "microdata", - "dublin core", + "twitter cards", "web scraper" ], "repository": { diff --git a/test/errors.js b/test/errors.js index 473cc47..fcc3403 100644 --- a/test/errors.js +++ b/test/errors.js @@ -109,9 +109,9 @@ describe('errors', function() { }); }); - it('should reject promise with malformed JSON-LD', function() { + it('should reject promise with malformed JSON-LD and missing content tags', function() { var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html')); - return assert.fails(meta.parseJsonLd($)); + return assert.fails(meta.parseAll($)); }); //TODO: Add test for lacking general metadata diff --git a/test/scraping.js b/test/scraping.js index 897e8f9..103ed8f 100644 --- a/test/scraping.js +++ b/test/scraping.js @@ -130,6 +130,7 @@ describe('scraping', function() { assert.deepEqual(JSON.stringify(res.openGraph.image), expectedImage); }); }); + }); it('should get Schema.org Microdata', function() { @@ -198,4 +199,19 @@ describe('scraping', function() { }); }); + it('should not have any undefined values', function() { + url = 'https://www.cnet.com/special-reports/vr101/'; + return preq.get(url).then(function(callRes) { + var chtml = cheerio.load(callRes.body); + return meta.parseAll(chtml) + .then(function(results) { + Object.keys(results).forEach(function(metadataType) { + Object.keys(results[metadataType]).forEach(function(key) { + assert.notDeepEqual(results[metadataType][key], undefined); // Ensure all values are not undefined in response + }); + }); + }); + }); + }); + }); diff --git a/test/static/turtle_article_errors.html b/test/static/turtle_article_errors.html index c5600d4..b024cf8 100644 --- a/test/static/turtle_article_errors.html +++ b/test/static/turtle_article_errors.html @@ -1,6 +1,9 @@ - + -Turtles are AWESOME!!1 | Invalid Turtles Website + + @@ -13,6 +16,71 @@ } + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +