diff --git a/lib/index.js b/lib/index.js index 370fdaf..310f5ba 100644 --- a/lib/index.js +++ b/lib/index.js @@ -65,8 +65,8 @@ exports.parseBase = BBPromise.method(function(chtml, tags, reason, getProperty, var property = getProperty(element); var content = getContent(element); - // If the element isn't what we're looking for, skip it - if (!property) { + // If lacks property or content, skip + if (!property || !content) { return; } @@ -102,14 +102,15 @@ exports.parseBEPress = BBPromise.method(function(chtml){ ['meta'], 'No BE Press metadata found in page', function(element) { - var nameAttr = element.attr('name'); + var content = element.attr('content'); + var name = element.attr('name'); - // If the element isn't a BE Press property, skip it - if (!nameAttr || (nameAttr.substring(0, 17).toLowerCase() !== 'bepress_citation_')) { + // If the element isn't a BE Press property or if content is missing, skip it + if (!name || !content || (name.substring(0, 17).toLowerCase() !== 'bepress_citation_')) { return; } - return nameAttr.substring(17).toLowerCase(); + return name.substring(17).toLowerCase(); }, function(element) { return element.attr('content'); @@ -212,11 +213,12 @@ exports.parseDublinCore = BBPromise.method(function(chtml){ ['meta', 'link'], 'No Dublin Core metadata found in page', function(element) { - var isLink = element[0].name === 'link', - nameAttr = element.attr(isLink ? 'rel' : 'name'); + var isLink = element[0].name === 'link'; + var nameAttr = element.attr(isLink ? 'rel' : 'name'); + var value = element.attr(isLink ? 'href' : 'content'); - // If the element isn't a Dublin Core property, skip it - if (!nameAttr + // If the element isn't a Dublin Core property or if value is missing, skip it + if (!nameAttr || !value || (nameAttr.substring(0, 3).toUpperCase() !== 'DC.' && nameAttr.substring(0, 8).toUpperCase() !== 'DCTERMS.')) { return; @@ -247,9 +249,10 @@ exports.parseEprints = BBPromise.method(function(chtml){ 'No EPrints metadata found in page', function(element) { var nameAttr = element.attr('name'); + var content = element.attr('content'); - // If the element isn't an EPrints property, skip it - if (!nameAttr || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') { + // If the element isn't an EPrints property or content is missing, skip it + if (!nameAttr || !content || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') { return; } @@ -295,7 +298,7 @@ exports.parseGeneral = BBPromise.method(function(chtml){ var value; Object.keys(clutteredMeta).forEach(function(key){ value = clutteredMeta[key]; - if (value){ + if (value){ // Only add if has value meta[key] = value; } }); @@ -322,9 +325,10 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){ 'No Highwire Press metadata found in page', function(element) { var nameAttr = element.attr('name'); + var content = element.attr('content'); // If the element isn't a Highwire Press property, skip it - if (!nameAttr || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) { + if (!nameAttr || !content || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) { return; } @@ -354,7 +358,11 @@ exports.parseJsonLd = BBPromise.method(function(chtml) { // Fail silently, just in case there are valid tags return; } - json.push(contents); + if (contents){ + json.push(contents); + } else { + return; + } }); if (json.length === 0) { @@ -370,10 +378,7 @@ exports.parseJsonLd = BBPromise.method(function(chtml) { * @return {Object} promise of open graph metadata object */ exports.parseOpenGraph = BBPromise.method(function(chtml){ - - var element; var itemType; - var propertyValue; var property; var node; var meta = {}; @@ -391,22 +396,21 @@ exports.parseOpenGraph = BBPromise.method(function(chtml){ if (!metaTags || metaTags.length === 0){ throw reason; } metaTags.each(function() { - element = chtml(this); - propertyValue = element.attr('property'); + var element = chtml(this); + var propertyValue = element.attr('property'); + var content = element.attr('content'); - if (!propertyValue){ + if (!propertyValue || !content){ return; } else { propertyValue = propertyValue.toLowerCase().split(':'); } - // If the element isn't in namespace, exit + // If the property isn't in namespace, exit if (namespace.indexOf(propertyValue[0]) < 0){ return; } - var content = element.attr('content'); - if (propertyValue.length === 2){ property = propertyValue[1]; // Set property to value after namespace if (property in subProperty){ // If has valid subproperty @@ -497,28 +501,28 @@ exports.parseTwitter = BBPromise.method(function(chtml) { metaTags.each(function() { var element = chtml(this); - var propertyValue = element.attr('name'); + var name = element.attr('name'); var property; var content = element.attr('content'); var node; - // Exit if not a twitter tag - if (!propertyValue){ + // Exit if not a twitter tag or content is missing + if (!name|| !content){ return; } else { - propertyValue = propertyValue.toLowerCase().split(':'); - property = propertyValue[1]; + name = name.toLowerCase().split(':'); + property = name[1]; } // Exit if tag not twitter metadata - if(propertyValue[0] !== 'twitter') { + if(name[0] !== 'twitter') { return; } // Handle nested properties - if(propertyValue.length > 2) { - var subProperty = propertyValue[2]; + if(name.length > 2) { + var subProperty = name[2]; // Upgrade the property to an object if it needs to be if(property in dualStateSubProperties && !(meta[property] instanceof Object)) { @@ -532,16 +536,16 @@ exports.parseTwitter = BBPromise.method(function(chtml) { // Differentiate betweeen twice and thrice nested properties // Not the prettiest solution, but twitter metadata guidelines are fairly strict so it's not nessesary // to anticipate strange data. - if(propertyValue.length === 3) { + if(name.length === 3) { node[subProperty] = content; - } else if (propertyValue.length === 4) { + } else if (name.length === 4) { // Solve the very specific twitter:player:stream:content_type case where stream needs to be upgraded to an object if(subProperty.toLowerCase() === "stream"){ node[subProperty] = {url: node[subProperty] }; }else { node[subProperty] = node[subProperty] ? node[subProperty] : {}; //Either create a new subnode or ammend the existing one } - node[subProperty][propertyValue[3]] = content; + node[subProperty][name[3]] = content; } else { // Something is malformed, so exit return; diff --git a/package.json b/package.json index 3b7384b..0bbadb3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "html-metadata", - "version": "1.6.2", + "version": "1.6.3", "description": "Scrapes metadata of several different standards", "main": "index.js", "dependencies": { @@ -20,10 +20,16 @@ "coverage": "istanbul cover _mocha -- -R spec" }, "keywords": [ + "bepress", + "coins", + "dublin core", + "eprints", + "highwire press", + "json-ld", "open graph", "metadata", "microdata", - "dublin core", + "twitter cards", "web scraper" ], "repository": { diff --git a/test/errors.js b/test/errors.js index 473cc47..fcc3403 100644 --- a/test/errors.js +++ b/test/errors.js @@ -109,9 +109,9 @@ describe('errors', function() { }); }); - it('should reject promise with malformed JSON-LD', function() { + it('should reject promise with malformed JSON-LD and missing content tags', function() { var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html')); - return assert.fails(meta.parseJsonLd($)); + return assert.fails(meta.parseAll($)); }); //TODO: Add test for lacking general metadata diff --git a/test/scraping.js b/test/scraping.js index 897e8f9..103ed8f 100644 --- a/test/scraping.js +++ b/test/scraping.js @@ -130,6 +130,7 @@ describe('scraping', function() { assert.deepEqual(JSON.stringify(res.openGraph.image), expectedImage); }); }); + }); it('should get Schema.org Microdata', function() { @@ -198,4 +199,19 @@ describe('scraping', function() { }); }); + it('should not have any undefined values', function() { + url = 'https://www.cnet.com/special-reports/vr101/'; + return preq.get(url).then(function(callRes) { + var chtml = cheerio.load(callRes.body); + return meta.parseAll(chtml) + .then(function(results) { + Object.keys(results).forEach(function(metadataType) { + Object.keys(results[metadataType]).forEach(function(key) { + assert.notDeepEqual(results[metadataType][key], undefined); // Ensure all values are not undefined in response + }); + }); + }); + }); + }); + }); diff --git a/test/static/turtle_article_errors.html b/test/static/turtle_article_errors.html index c5600d4..b024cf8 100644 --- a/test/static/turtle_article_errors.html +++ b/test/static/turtle_article_errors.html @@ -1,6 +1,9 @@ - + -Turtles are AWESOME!!1 | Invalid Turtles Website + + @@ -13,6 +16,71 @@ } + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +