Skip to content

Commit

Permalink
Don't add tags with missing content
Browse files Browse the repository at this point in the history
Previously, metadata with tags that were missing content
were being added, resulting in keys with the value
'undefined.' This skips adding tags with missing
values.

Adds static and live test.

Addresses issue #54
  • Loading branch information
mvolz authored and d00rman committed Feb 9, 2017
1 parent 958991b commit 7b7e7ee
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 41 deletions.
74 changes: 39 additions & 35 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ exports.parseBase = BBPromise.method(function(chtml, tags, reason, getProperty,
var property = getProperty(element);
var content = getContent(element);

// If the element isn't what we're looking for, skip it
if (!property) {
// If lacks property or content, skip
if (!property || !content) {
return;
}

Expand Down Expand Up @@ -102,14 +102,15 @@ exports.parseBEPress = BBPromise.method(function(chtml){
['meta'],
'No BE Press metadata found in page',
function(element) {
var nameAttr = element.attr('name');
var content = element.attr('content');
var name = element.attr('name');

// If the element isn't a BE Press property, skip it
if (!nameAttr || (nameAttr.substring(0, 17).toLowerCase() !== 'bepress_citation_')) {
// If the element isn't a BE Press property or if content is missing, skip it
if (!name || !content || (name.substring(0, 17).toLowerCase() !== 'bepress_citation_')) {
return;
}

return nameAttr.substring(17).toLowerCase();
return name.substring(17).toLowerCase();
},
function(element) {
return element.attr('content');
Expand Down Expand Up @@ -212,11 +213,12 @@ exports.parseDublinCore = BBPromise.method(function(chtml){
['meta', 'link'],
'No Dublin Core metadata found in page',
function(element) {
var isLink = element[0].name === 'link',
nameAttr = element.attr(isLink ? 'rel' : 'name');
var isLink = element[0].name === 'link';
var nameAttr = element.attr(isLink ? 'rel' : 'name');
var value = element.attr(isLink ? 'href' : 'content');

// If the element isn't a Dublin Core property, skip it
if (!nameAttr
// If the element isn't a Dublin Core property or if value is missing, skip it
if (!nameAttr || !value
|| (nameAttr.substring(0, 3).toUpperCase() !== 'DC.'
&& nameAttr.substring(0, 8).toUpperCase() !== 'DCTERMS.')) {
return;
Expand Down Expand Up @@ -247,9 +249,10 @@ exports.parseEprints = BBPromise.method(function(chtml){
'No EPrints metadata found in page',
function(element) {
var nameAttr = element.attr('name');
var content = element.attr('content');

// If the element isn't an EPrints property, skip it
if (!nameAttr || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') {
// If the element isn't an EPrints property or content is missing, skip it
if (!nameAttr || !content || nameAttr.substring(0, 8).toLowerCase() !== 'eprints.') {
return;
}

Expand Down Expand Up @@ -295,7 +298,7 @@ exports.parseGeneral = BBPromise.method(function(chtml){
var value;
Object.keys(clutteredMeta).forEach(function(key){
value = clutteredMeta[key];
if (value){
if (value){ // Only add if has value
meta[key] = value;
}
});
Expand All @@ -322,9 +325,10 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){
'No Highwire Press metadata found in page',
function(element) {
var nameAttr = element.attr('name');
var content = element.attr('content');

// If the element isn't a Highwire Press property or content is missing, skip it
if (!nameAttr || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) {
if (!nameAttr || !content || (nameAttr.substring(0, 9).toLowerCase() !== 'citation_')) {
return;
}

Expand Down Expand Up @@ -354,7 +358,11 @@ exports.parseJsonLd = BBPromise.method(function(chtml) {
// Fail silently, just in case there are valid tags
return;
}
json.push(contents);
if (contents){
json.push(contents);
} else {
return;
}
});

if (json.length === 0) {
Expand All @@ -370,10 +378,7 @@ exports.parseJsonLd = BBPromise.method(function(chtml) {
* @return {Object} promise of open graph metadata object
*/
exports.parseOpenGraph = BBPromise.method(function(chtml){

var element;
var itemType;
var propertyValue;
var property;
var node;
var meta = {};
Expand All @@ -391,22 +396,21 @@ exports.parseOpenGraph = BBPromise.method(function(chtml){
if (!metaTags || metaTags.length === 0){ throw reason; }

metaTags.each(function() {
element = chtml(this);
propertyValue = element.attr('property');
var element = chtml(this);
var propertyValue = element.attr('property');
var content = element.attr('content');

if (!propertyValue){
if (!propertyValue || !content){
return;
} else {
propertyValue = propertyValue.toLowerCase().split(':');
}

// If the element isn't in namespace, exit
// If the property isn't in namespace, exit
if (namespace.indexOf(propertyValue[0]) < 0){
return;
}

var content = element.attr('content');

if (propertyValue.length === 2){
property = propertyValue[1]; // Set property to value after namespace
if (property in subProperty){ // If has valid subproperty
Expand Down Expand Up @@ -497,28 +501,28 @@ exports.parseTwitter = BBPromise.method(function(chtml) {

metaTags.each(function() {
var element = chtml(this);
var propertyValue = element.attr('name');
var name = element.attr('name');

var property;
var content = element.attr('content');
var node;

// Exit if not a twitter tag
if (!propertyValue){
// Exit if not a twitter tag or content is missing
if (!name|| !content){
return;
} else {
propertyValue = propertyValue.toLowerCase().split(':');
property = propertyValue[1];
name = name.toLowerCase().split(':');
property = name[1];
}

// Exit if tag not twitter metadata
if(propertyValue[0] !== 'twitter') {
if(name[0] !== 'twitter') {
return;
}

// Handle nested properties
if(propertyValue.length > 2) {
var subProperty = propertyValue[2];
if(name.length > 2) {
var subProperty = name[2];

// Upgrade the property to an object if it needs to be
if(property in dualStateSubProperties && !(meta[property] instanceof Object)) {
Expand All @@ -532,16 +536,16 @@ exports.parseTwitter = BBPromise.method(function(chtml) {
// Differentiate between twice and thrice nested properties
// Not the prettiest solution, but twitter metadata guidelines are fairly strict so it's not necessary
// to anticipate strange data.
if(propertyValue.length === 3) {
if(name.length === 3) {
node[subProperty] = content;
} else if (propertyValue.length === 4) {
} else if (name.length === 4) {
// Solve the very specific twitter:player:stream:content_type case where stream needs to be upgraded to an object
if(subProperty.toLowerCase() === "stream"){
node[subProperty] = {url: node[subProperty] };
}else {
node[subProperty] = node[subProperty] ? node[subProperty] : {}; //Either create a new subnode or amend the existing one
}
node[subProperty][propertyValue[3]] = content;
node[subProperty][name[3]] = content;
} else {
// Something is malformed, so exit
return;
Expand Down
10 changes: 8 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-metadata",
"version": "1.6.2",
"version": "1.6.3",
"description": "Scrapes metadata of several different standards",
"main": "index.js",
"dependencies": {
Expand All @@ -20,10 +20,16 @@
"coverage": "istanbul cover _mocha -- -R spec"
},
"keywords": [
"bepress",
"coins",
"dublin core",
"eprints",
"highwire press",
"json-ld",
"open graph",
"metadata",
"microdata",
"dublin core",
"twitter cards",
"web scraper"
],
"repository": {
Expand Down
4 changes: 2 additions & 2 deletions test/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ describe('errors', function() {
});
});

it('should reject promise with malformed JSON-LD', function() {
it('should reject promise with malformed JSON-LD and missing content tags', function() {
var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html'));
return assert.fails(meta.parseJsonLd($));
return assert.fails(meta.parseAll($));
});

//TODO: Add test for lacking general metadata
Expand Down
16 changes: 16 additions & 0 deletions test/scraping.js
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ describe('scraping', function() {
assert.deepEqual(JSON.stringify(res.openGraph.image), expectedImage);
});
});

});

it('should get Schema.org Microdata', function() {
Expand Down Expand Up @@ -198,4 +199,19 @@ describe('scraping', function() {
});
});

// Live test: scrape a real page and verify that no metadata key, in any of the
// metadata types returned by parseAll, carries an undefined value.
it('should not have any undefined values', function() {
    url = 'https://www.cnet.com/special-reports/vr101/';
    return preq.get(url)
    .then(function(response) {
        // Load the fetched HTML and run every parser over it
        return meta.parseAll(cheerio.load(response.body));
    })
    .then(function(allMetadata) {
        Object.keys(allMetadata).forEach(function(standard) {
            var tags = allMetadata[standard];
            Object.keys(tags).forEach(function(tagName) {
                // Each key present in the response must hold a defined value
                assert.notDeepEqual(tags[tagName], undefined);
            });
        });
    });
});

});
72 changes: 70 additions & 2 deletions test/static/turtle_article_errors.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
<html lang="en">
<html>
<head>
<title>Turtles are AWESOME!!1 | Invalid Turtles Website</title>
<!--
This file contains entirely invalid metadata and should cause parseAll to fail
-->
<title></title>
</head>

<body>
Expand All @@ -13,6 +16,71 @@
}
</script>

<meta name="author" badcontent="Turtle Lvr">
<meta name="robots" badcontent="we welcome our robot overlords"/>
<meta name="description" badcontent="Exposition on the awesomeness of turtles"/>
<meta name="keywords" badcontent="turtle, movie" />

<link rel="canonical" badhref="http://example.com/turtles" />
<link rel="publisher" badhref="https://mediawiki.org"/>
<link rel="author" badhref="http://examples.com/turtlelvr"/>
<link rel="shortlink" badhref="http://example.com/c" />

<!--Open Graph-->

<meta property="og:locale" badcontent="en_US" />
<meta property="og:type" badcontent="video.movie" />
<meta property="og:title" badcontent="Turtles of the Jungle" />
<meta property="og:description" badcontent="A 2008 film about jungle turtles." />
<meta property="og:url" badcontent="http://example.com" />
<meta property="og:site_name" badcontent="Awesome Turtle Movies Website" />
<meta property="og:image" badcontent="http://example.com/turtle.jpg" />
<meta property="og:image" badcontent="http://example.com/shell.jpg" />

<meta property="video:tag" badcontent="turtle" />
<meta property="video:tag" badcontent="movie" />
<meta property="video:tag" badcontent="awesome" />
<meta property="video:director" badcontent="http://www.example.com/PhilTheTurtle" />
<meta property="video:actor" badcontent="http://www.example.com/PatTheTurtle" />
<meta property="video:actor:role" badcontent="Turtle #3" /> <!-- Currently ignored -->
<meta property="video:actor" badcontent="http://www.example.com/SaminaTheTurtle" />
<meta property="video:writer" badcontent="http://www.example.com/TinaTheTurtle" />
<meta property="video:release_date" badcontent="2015-01-14T19:14:27+00:00" />
<meta property="video:duration" badcontent="1000000" />

<!--AL-->

<meta property="al:ios:url" badcontent="turtle://">
<meta property="al:ios:app_store_id" badcontent="000">
<meta property="al:android:url" badcontent="turtle://">
<meta property="al:android:package" badcontent="superturtlearticle.androidapp">
<meta property="al:web:url" badcontent="http://example.com/">
<meta property="al:web:should_fallback" badcontent="true">

<!--Twitter-->

<meta name="twitter:card" badcontent="summary">
<meta name="twitter:site" badcontent="@Turtlessssssssss">
<meta name="twitter:creator" badcontent="@Turtlessssssssss">
<meta name="twitter:url" badcontent="http://www.example.com/turtles">
<meta name="twitter:title" badcontent="Turtles of the Jungle">
<meta name="twitter:description" badcontent="A 2008 film about jungle turtles.">
<meta name="twitter:player" badcontent="http://www.example.com/turtles/player">
<meta name="twitter:player:width" badcontent="400">
<meta name="twitter:player:height" badcontent="400">
<meta name="twitter:player:stream" badcontent="http://www.example.com/turtles/turtle.mp4">
<meta name="twitter:player:stream:badcontent_type" badcontent="video/mp4">

<!--Dublin Core-->

<meta name="DC.Title" badcontent="Turtles of the Jungle" >
<meta name="DC.Creator" badcontent="http://www.example.com/turtlelvr" >
<meta name="DC.Description" badcontent="A 2008 film about jungle turtles." >
<meta name="DC.Date" badcontent="2012-02-04 12:00:00" >
<meta name="DC.Type" badcontent="Image.Moving" >



</body>

</html>

0 comments on commit 7b7e7ee

Please sign in to comment.