Skip to content

Commit

Permalink
Add Twitter Support; Merge pull request #47 from blydro/master
Browse files Browse the repository at this point in the history
Follows the twitter guidelines.

Bug: T148835
  • Loading branch information
mvolz authored Dec 4, 2016
2 parents 2fa47e7 + aa94d72 commit e8c018d
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 8 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ html-metadata

# MetaData html scraper and parser for Node.js (supports Promises and callback style)

The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).
The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).

Planned is support for RDFa, Twitter, AGLS, and other yet unheard of metadata types. Contributions and requests for other metadata types welcome!
Support for RDFa, AGLS, and other as-yet-unheard-of metadata types is planned. Contributions and requests for other metadata types are welcome!

## Install

Expand Down
11 changes: 11 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,17 @@ exports.parseSchemaOrgMicrodata = function(chtml, callback){
return index.parseSchemaOrgMicrodata(chtml).nodeify(callback);
};

/**
* Scrapes Twitter data given html object
*
* @param {Object} chtml html Cheerio object
* @param {Function} [callback] optional callback function
* @return {Object} BBPromise for metadata
*/
exports.parseTwitter = function(chtml, callback){
return index.parseTwitter(chtml).nodeify(callback);
};

/**
* Global exportable list of scraping promises with string keys
* @type {Object}
Expand Down
99 changes: 98 additions & 1 deletion lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,102 @@ exports.parseSchemaOrgMicrodata = BBPromise.method(function(chtml){
return meta;
});


/**
* Scrapes twitter microdata given Cheerio html object
* @param {Object} chtml html Cheerio object
* @return {Object} promise of twitter metadata object
*/
exports.parseTwitter = BBPromise.method(function(chtml) {
if (!chtml) {
throw new Error('Undefined argument');
}

var meta = {};
var metaTags = chtml('meta');

// These properties can either be strings or objects
var dualStateSubProperties = {
image : 'url',
player : 'url',
creator : '@username'
};

metaTags.each(function() {
var element = chtml(this);
var propertyValue = element.attr('name');

var property;
var content = element.attr('content');
var node;

// Exit if not a twitter tag
if (!propertyValue){
return;
} else {
propertyValue = propertyValue.toLowerCase().split(':');
property = propertyValue[1];
}

// Exit if tag not twitter metadata
if(propertyValue[0] !== 'twitter') {
return;
}

// Handle nested properties
if(propertyValue.length > 2) {
var subProperty = propertyValue[2];

// Upgrade the property to an object if it needs to be
if(property in dualStateSubProperties && !(meta[property] instanceof Object)) {
node = {};
node[dualStateSubProperties[property]] = meta[property];
meta[property] = []; // Clear out the existing string as we just placed it into our new node
}else {
node = meta[property] ? meta[property] : {}; // Either create a new node or ammend the existing one
}

// Differentiate betweeen twice and thrice nested properties
// Not the prettiest solution, but twitter metadata guidelines are fairly strict so it's not nessesary
// to anticipate strange data.
if(propertyValue.length === 3) {
node[subProperty] = content;
} else if (propertyValue.length === 4) {
// Solve the very specific twitter:player:stream:content_type case where stream needs to be upgraded to an object
if(subProperty.toLowerCase() === "stream"){
node[subProperty] = {url: node[subProperty] };
}else {
node[subProperty] = node[subProperty] ? node[subProperty] : {}; //Either create a new subnode or ammend the existing one
}
node[subProperty][propertyValue[3]] = content;
} else {
// Something is malformed, so exit
return;
}
}else {
node = content;
}

// Create array if property exists and is not a nested object
if(meta[property] && !(meta[property] instanceof Object)) {
if (meta[property] instanceof Array) {
meta[property].push(node);
} else {
meta[property] = [meta[property], node];
}
}else {
meta[property] = node;
}
});

if(Object.keys(meta).length === 0) {
throw new Error("No twitter metadata found on this page");
}

return meta;
});


/**
* Global exportable list of scraping promises with string keys
* @type {Object}
Expand All @@ -460,5 +556,6 @@ exports.metadataFunctions = {
'general': exports.parseGeneral,
'highwirePress': exports.parseHighwirePress,
'openGraph': exports.parseOpenGraph,
'schemaOrg': exports.parseSchemaOrgMicrodata
'schemaOrg': exports.parseSchemaOrgMicrodata,
'twitter': exports.parseTwitter
};
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-metadata",
"version": "1.4.4",
"version": "1.5.0",
"description": "Scrapes metadata of several different standards",
"main": "index.js",
"dependencies": {
Expand Down
10 changes: 10 additions & 0 deletions test/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ describe('errors', function() {
});
});

it('should not find twitter metadata, reject promise', function() {
	// The fetched page is expected to carry no twitter meta tags, so the
	// promise returned by parseTwitter should be rejected.
	var target = 'http://example.com';
	return preq.get(target).then(function(response) {
		var $ = cheerio.load(response.body);
		return assert.fails(meta.parseTwitter($));
	});
});

//TODO: Add test for lacking general metadata
//TODO: Add test for lacking any metadata

Expand Down
27 changes: 26 additions & 1 deletion test/scraping.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ var cheerio = require('cheerio');

describe('scraping', function() {

this.timeout(40000);
this.timeout(50000);

var url;

Expand Down Expand Up @@ -144,4 +144,29 @@ describe('scraping', function() {
});
});

describe('twitter tests', function() {
	// Both tests scrape live pages, so they depend on the network and on the
	// current markup of the target sites.
	it('should get most basic twitter info', function() {
		url = 'http://www.aftenposten.no/kultur/Pinlig-for-Skaber-555558b.html';
		return meta(url).then(function(res) {
			['card', 'site', 'description', 'title', 'image'].forEach(function(key) {
				if(!res.twitter[key]) {
					throw new Error('Expected to find the ' + key + ' key in the response!');
				}
			});
		});
	});

	it('should get twitter nested data correctly', function() {
		url = 'http://www.theguardian.com/us';
		return meta(url).then(function(res) {
			// Compared structurally rather than as a JSON string, so the
			// order in which the page lists its meta tags cannot fail the test.
			var expected = {
				app: {
					id: {
						iphone: '409128287',
						ipad: '409128287',
						googleplay: 'com.guardian'
					},
					name: {
						googleplay: 'The Guardian',
						ipad: 'The Guardian',
						iphone: 'The Guardian'
					},
					url: {
						ipad: 'gnmguardian://us?contenttype=front&source=twitter',
						iphone: 'gnmguardian://us?contenttype=front&source=twitter'
					}
				},
				site: '@guardian',
				card: 'summary',
				url: 'https://www.theguardian.com/us'
			};
			assert.deepEqual(res.twitter, expected);
		});
	});
});

});
Loading

0 comments on commit e8c018d

Please sign in to comment.