Add Twitter Support; Merge pull request #47 from blydro/master

Follows the twitter guidelines. Bug: T148835
wikimedia · Dec 4, 2016 · e8c018d · e8c018d
2 parents 2fa47e7 + aa94d72
commit e8c018d
Show file tree

Hide file tree

Showing 10 changed files with 170 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -3,9 +3,9 @@ html-metadata
 
 # MetaData html scraper and parser for Node.js (supports Promises and callback style)
 
-The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).
+The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).
 
-Planned is support for RDFa, Twitter, AGLS, and other yet unheard of metadata types. Contributions and requests for other metadata types welcome!
+Support for RDFa, AGLS, and other as-yet-unheard-of metadata types is planned. Contributions and requests for other metadata types are welcome!
 
 ## Install
 

diff --git a/index.js b/index.js
@@ -143,6 +143,17 @@ exports.parseSchemaOrgMicrodata = function(chtml, callback){
 	return index.parseSchemaOrgMicrodata(chtml).nodeify(callback);
 };
 
+/**
+ * Scrapes Twitter data given html object
+ *
+ * @param  {Object}   chtml      html Cheerio object
+ * @param  {Function} [callback] optional callback function
+ * @return {Object}              BBPromise for metadata
+ */
+exports.parseTwitter = function(chtml, callback){
+	return index.parseTwitter(chtml).nodeify(callback);
+};
+
 /**
  * Global exportable list of scraping promises with string keys
  * @type {Object}

diff --git a/lib/index.js b/lib/index.js
@@ -448,6 +448,102 @@ exports.parseSchemaOrgMicrodata = BBPromise.method(function(chtml){
 	return meta;
 });
 
+
+/**
+ * Scrapes Twitter metadata (meta tags whose name starts with "twitter:") given Cheerio html object
+ * @param  {Object}   chtml html Cheerio object
+ * @return {Object}   promise of twitter metadata object
+ */
+exports.parseTwitter = BBPromise.method(function(chtml) {
+	if (!chtml) {
+		throw new Error('Undefined argument');
+	}
+
+	var meta = {};
+	var metaTags = chtml('meta');
+
+	// These properties can either be strings or objects
+	var dualStateSubProperties = {
+			image : 'url',
+			player : 'url',
+			creator : '@username'
+		};
+
+	metaTags.each(function() {
+		var element = chtml(this);
+		var propertyValue = element.attr('name');
+
+		var property;
+		var content = element.attr('content');
+		var node;
+
+		// Exit if the tag has no name attribute
+		if (!propertyValue){
+			return;
+		} else {
+			propertyValue = propertyValue.toLowerCase().split(':');
+			property = propertyValue[1];
+		}
+
+		// Exit if tag not twitter metadata
+		if(propertyValue[0] !== 'twitter') {
+			return;
+		}
+
+		// Handle nested properties
+		if(propertyValue.length > 2) {
+			var subProperty = propertyValue[2];
+
+			// Upgrade the property to an object if it needs to be
+			if(property in dualStateSubProperties && !(meta[property] instanceof Object)) {
+				node = {};
+				node[dualStateSubProperties[property]] = meta[property];
+				meta[property] = []; // Clear out the existing string as we just placed it into our new node
+			}else {
+				node = meta[property] ? meta[property] : {}; // Either create a new node or amend the existing one
+			}
+
+			// Differentiate between twice and thrice nested properties
+			// Not the prettiest solution, but twitter metadata guidelines are fairly strict so it's not necessary
+			// to anticipate strange data.
+			if(propertyValue.length === 3) {
+				node[subProperty] = content;
+			} else if (propertyValue.length === 4) {
+				// Solve the very specific twitter:player:stream:content_type case where stream needs to be upgraded to an object
+				if(subProperty.toLowerCase() === "stream"){
+					node[subProperty] = {url: node[subProperty] };
+				}else {
+					node[subProperty] = node[subProperty] ? node[subProperty] : {}; //Either create a new subnode or amend the existing one
+				}
+				node[subProperty][propertyValue[3]] = content;
+			} else {
+				// Something is malformed, so exit
+				return;
+			}
+		}else {
+			node = content;
+		}
+
+		// Create array if property exists and is not a nested object
+		if(meta[property] && !(meta[property] instanceof Object)) {
+			if (meta[property] instanceof Array) {
+				meta[property].push(node);
+			} else {
+				meta[property] = [meta[property], node];
+			}
+		}else {
+			meta[property] = node;
+		}
+	});
+
+	if(Object.keys(meta).length === 0) {
+		throw new Error("No twitter metadata found on this page");
+	}
+
+	return meta;
+});
+
+
 /**
  * Global exportable list of scraping promises with string keys
  * @type {Object}
@@ -460,5 +556,6 @@ exports.metadataFunctions = {
 	'general': exports.parseGeneral,
 	'highwirePress': exports.parseHighwirePress,
 	'openGraph': exports.parseOpenGraph,
-	'schemaOrg': exports.parseSchemaOrgMicrodata
+	'schemaOrg': exports.parseSchemaOrgMicrodata,
+	'twitter': exports.parseTwitter
 };
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "html-metadata",
-  "version": "1.4.4",
+  "version": "1.5.0",
   "description": "Scrapes metadata of several different standards",
   "main": "index.js",
   "dependencies": {

diff --git a/test/errors.js b/test/errors.js
@@ -88,6 +88,16 @@ describe('errors', function() {
 		});
 	});
 
+	it('should not find twitter metadata, reject promise', function() {
+		var url = 'http://example.com';
+		return preq.get(url)
+		.then(function(callRes) {
+			var $ = cheerio.load(callRes.body);
+			var prom = meta.parseTwitter($);
+			return assert.fails(prom);
+		});
+	});
+
 	//TODO: Add test for lacking general metadata
 	//TODO: Add test for lacking any metadata
 

diff --git a/test/scraping.js b/test/scraping.js
@@ -16,7 +16,7 @@ var cheerio = require('cheerio');
 
 describe('scraping', function() {
 
-	this.timeout(40000);
+	this.timeout(50000);
 
 	var url;
 
@@ -144,4 +144,29 @@ describe('scraping', function() {
 		});
 	});
 
+	describe('twitter tests', function() {
+		it('should get most basic twitter info', function() {
+			url = 'http://www.aftenposten.no/kultur/Pinlig-for-Skaber-555558b.html';
+			return meta(url)
+			.catch(function(e){throw e;})
+			.then(function(res) {
+				['card', 'site', 'description', 'title', 'image'].forEach(function(key) {
+					if(!res.twitter[key]) {
+						throw new Error('Expected to find the ' + key + ' key in the response!');
+					}
+				});
+			});
+		});
+
+		it('should get twitter nested data correctly', function() {
+			url = 'http://www.theguardian.com/us';
+			return meta(url)
+			.catch(function(e){throw e;})
+			.then(function(res) {
+				var expected = '{"app":{"id":{"iphone":"409128287","ipad":"409128287","googleplay":"com.guardian"},"name":{"googleplay":"The Guardian","ipad":"The Guardian","iphone":"The Guardian"},"url":{"ipad":"gnmguardian://us?contenttype=front&source=twitter","iphone":"gnmguardian://us?contenttype=front&source=twitter"}},"site":"@guardian","card":"summary","url":"https://www.theguardian.com/us"}';
+				assert.deepEqual(JSON.stringify(res.twitter), expected);
+			});
+		});
+	});
+
 });