From e22d20f88ca3f9555393ca80e7764dbb0df687ca Mon Sep 17 00:00:00 2001 From: kotylo Date: Sat, 11 Dec 2021 20:24:34 +0100 Subject: [PATCH] Replace fuzzysearch with fuzzyset.js --- src/background.js | 2 +- src/content-script.js | 149 ++++++++++++-------- src/libraries/fuzzyset.js | 281 ++++++++++++++++++++++++++++++++++++++ src/manifest.json | 2 +- 4 files changed, 373 insertions(+), 61 deletions(-) create mode 100644 src/libraries/fuzzyset.js diff --git a/src/background.js b/src/background.js index d72c8fe..e7a2039 100644 --- a/src/background.js +++ b/src/background.js @@ -14,7 +14,7 @@ chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { if (request.action === "findMovie") { // todo: add &ttype=ft (film titles?) to the options? - fetch(`https://www.imdb.com/find?q=${request.movie.name}&s=tt&ttype=ft&ref_=fn_ft&count=3`) + fetch(`https://www.imdb.com/find?q=${request.movie.name}&s=tt&ttype=ft&ttype=tv&ref_=fn_ft&count=3`) .then(res => res.text()) .then(html => sendResponse(html)); } else if (request.action === "getMovie") { diff --git a/src/content-script.js b/src/content-script.js index 09b0b58..bdfd075 100644 --- a/src/content-script.js +++ b/src/content-script.js @@ -45,10 +45,17 @@ function showRating(movie) { let link = document.createElement("a"); link.href = `https://www.imdb.com${movie.href}`; link.target = "_blank"; + if (movie.imdbName != null) { + link.title = `${movie.imdbName} on IMDB`; + } insertAfter(link, movie.iconElement); let imdb = document.createElement("div"); - imdb.textContent = `${movie.rating}`; + if (movie.rating == null) { + imdb.textContent = `N/A`; + } else { + imdb.textContent = `${movie.rating}`; + } let ratingCountPercentage = getOpacityPercentage(movie.ratingCount); let alphaColor = getColorFromPercentage(ratingCountPercentage); @@ -67,12 +74,14 @@ function showRating(movie) { imdb.style.float = "left"; link.appendChild(imdb); - let votesContainer = document.createElement("div"); - votesContainer.style.float = "right"; - votesContainer.style.fontSize = "5px"; - votesContainer.textContent = `${movie.ratingCount} v.`; - votesContainer.style.marginLeft = "5px"; - imdb.appendChild(votesContainer); + if (movie.ratingCount > 0) { + let votesContainer = document.createElement("div"); + votesContainer.style.float = "right"; + votesContainer.style.fontSize = "5px"; + votesContainer.textContent = `${movie.ratingCount} v.`; + votesContainer.style.marginLeft = "5px"; + imdb.appendChild(votesContainer); + } } function getColorFromPercentage(percentage) { @@ -94,8 +103,8 @@ function getOpacityPercentage(ratingCount) { } function getMovieInfo(movie) { - // in case you need to debug some specific movie: - // if (movie.name.indexOf(" Son") > 0) { + // //in case you need to debug some specific movie: + // if (movie.name.indexOf("One Shot") > 0) { // return getMovieInfoFromIMDB(movie); // } // return; @@ -153,27 +162,9 @@ function getMovieInfoFromIMDB(movie) { return; } - let a = null; - for (let i = 0; i < links.length; i++) { - let link = links[i]; - let aElements = link.getElementsByTagName("a"); - - for (let j = 0; j < aElements.length; j++) { - let aElement = aElements[j]; - if (aElement.textContent == ""){ - continue; - } - - var linkMovieName = getMovieName(link.textContent); - if (fuzzyMatch(linkMovieName, movie.name)) { - a = aElement; - break; - } - } - - if (a != null) { - break; - } + let a = findLinkWithFuzziness(links, movie, 0.75); + if (a == null){ + a = findLinkWithFuzziness(links, movie, 0.5); } if (a == null){ @@ -193,6 +184,7 @@ function getMovieInfoFromIMDB(movie) { } movie.id = id; movie.href = href; + movie.imdbName = getMovieName(a.textContent); return movie; }) .then(movie => { @@ -232,20 +224,34 @@ function getMovieInfoFromIMDB(movie) { return; } var parsedJson = JSON.parse(json); - let rating = parsedJson.aggregateRating.ratingValue; - if (rating == null) { - console.error("rating is null"); - return; - } + if (parsedJson.aggregateRating != null) { + let rating = parsedJson.aggregateRating.ratingValue; + if (rating == null) { + console.error("rating is null"); + return; + } - let ratingCount = parsedJson.aggregateRating.ratingCount; - if (ratingCount == null) { - console.error("ratingCount is null"); - return; - } + let ratingCount = parsedJson.aggregateRating.ratingCount; + if (ratingCount == null) { + console.error("ratingCount is null"); + return; + } - movie.rating = rating; - movie.ratingCount = ratingCount; + movie.rating = rating; + movie.ratingCount = ratingCount; + }else{ + // in case the movie is not rated, set the rating to 0 + movie.rating = 0; + movie.ratingCount = 0; + + // try to get the release date + let releaseDate = parsedJson.datePublished; + if (releaseDate == null) { + console.error(`aggregateRating is null and releaseDate is null for movie '${movie.name}'`); + return; + } + movie.releaseDate = releaseDate; + } return movie; }); }) @@ -266,28 +272,53 @@ function getMovieInfoFromIMDB(movie) { }); } -function getLinkToImdbText(movieName) { - return `Try to call it yourself: https://www.imdb.com/find?q=${movieName.replace(/\s/g, "%20")}&s=tt&ttype=ft&ref_=fn_ft&count=3`; -} +function findLinkWithFuzziness(links, movie, fuzzyValue){ + let a = null; + for (let i = 0; i < links.length; i++) { + let link = links[i]; + let aElements = link.getElementsByTagName("a"); -function fuzzyMatch(str1, str2) { - let str1Lower = str1.toLowerCase(); - let str2Lower = str2.toLowerCase(); - let str1Length = str1.length; - let str2Length = str2.length; + for (let j = 0; j < aElements.length; j++) { + let aElement = aElements[j]; + if (aElement.textContent == ""){ + continue; + } - let maxLength = str1Length > str2Length ? str1Length : str2Length; + let linkMovieName = getMovieName(aElement.textContent); + let currentMovieNameWithoutYear = movie.name.replace(/\s\d{4}\s*$/g, ""); + if (fuzzyMatch(linkMovieName, currentMovieNameWithoutYear, fuzzyValue)) { + a = aElement; + break; + } + } - let count = 0; - for (let i = 0; i < maxLength; i++) { - let char1 = str1Lower[i]; - let char2 = str2Lower[i]; - if (char1 == char2) { - count++; + if (a != null) { + break; } + + if (i > 5){ + console.log(`skipping more than 5 links for movie '${movie.name}'`); + break; + } + } + return a; +} + +function getLinkToImdbText(movieName) { + return `Try to call it yourself: https://www.imdb.com/find?q=${movieName.replace(/\s/g, "%20")}&s=tt&ttype=ft&ttype=tv&ref_=fn_ft&count=3`; +} + +function fuzzyMatch(str1, str2, fuzzyScore) { + let results = FuzzySet([str1]).get(str2); + if (results == null){ + return false; } - return count / maxLength > 0.7; + let [result] = results; + if (result.length > 1){ + return result[0] >= fuzzyScore; + } + return false; } function getScriptWithType(scripts, type) { @@ -311,7 +342,7 @@ function getMovieName(rawName) { movieName = movieName.replace(/[\(\)]/g, ""); // remove everything after year in the movieName let year = movieName.match(/\s\d{4}\s/); - if (year.length > 0) { + if (year != null && year.length > 0) { movieName = movieName.substring(0, year.index + year[0].length); } movieName = movieName.trim(); diff --git a/src/libraries/fuzzyset.js b/src/libraries/fuzzyset.js new file mode 100644 index 0000000..fccbea8 --- /dev/null +++ b/src/libraries/fuzzyset.js @@ -0,0 +1,281 @@ +var FuzzySet = (function () { + 'use strict'; + + const FuzzySet = function(arr, useLevenshtein, gramSizeLower, gramSizeUpper) { + var fuzzyset = { + + }; + + // default options + arr = arr || []; + fuzzyset.gramSizeLower = gramSizeLower || 2; + fuzzyset.gramSizeUpper = gramSizeUpper || 3; + fuzzyset.useLevenshtein = (typeof useLevenshtein !== 'boolean') ? true : useLevenshtein; + + // define all the object functions and attributes + fuzzyset.exactSet = {}; + fuzzyset.matchDict = {}; + fuzzyset.items = {}; + + // helper functions + var levenshtein = function(str1, str2) { + var current = [], prev, value; + + for (var i = 0; i <= str2.length; i++) + for (var j = 0; j <= str1.length; j++) { + if (i && j) + if (str1.charAt(j - 1) === str2.charAt(i - 1)) + value = prev; + else + value = Math.min(current[j], current[j - 1], prev) + 1; + else + value = i + j; + + prev = current[j]; + current[j] = value; + } + + return current.pop(); + }; + + // return an edit distance from 0 to 1 + var _distance = function(str1, str2) { + if (str1 === null && str2 === null) throw 'Trying to compare two null values'; + if (str1 === null || str2 === null) return 0; + str1 = String(str1); str2 = String(str2); + + var distance = levenshtein(str1, str2); + if (str1.length > str2.length) { + return 1 - distance / str1.length; + } else { + return 1 - distance / str2.length; + } + }; + var _nonWordRe = /[^a-zA-Z0-9\u00C0-\u00FF, ]+/g; + + var _iterateGrams = function(value, gramSize) { + gramSize = gramSize || 2; + var simplified = '-' + value.toLowerCase().replace(_nonWordRe, '') + '-', + lenDiff = gramSize - simplified.length, + results = []; + if (lenDiff > 0) { + for (var i = 0; i < lenDiff; ++i) { + simplified += '-'; + } + } + for (var i = 0; i < simplified.length - gramSize + 1; ++i) { + results.push(simplified.slice(i, i + gramSize)); + } + return results; + }; + + var _gramCounter = function(value, gramSize) { + // return an object where key=gram, value=number of occurrences + gramSize = gramSize || 2; + var result = {}, + grams = _iterateGrams(value, gramSize), + i = 0; + for (i; i < grams.length; ++i) { + if (grams[i] in result) { + result[grams[i]] += 1; + } else { + result[grams[i]] = 1; + } + } + return result; + }; + + // the main functions + fuzzyset.get = function(value, defaultValue, minMatchScore) { + // check for value in set, returning defaultValue or null if none found + if (minMatchScore === undefined) { + minMatchScore = .33; + } + var result = this._get(value, minMatchScore); + if (!result && typeof defaultValue !== 'undefined') { + return defaultValue; + } + return result; + }; + + fuzzyset._get = function(value, minMatchScore) { + var results = []; + // start with high gram size and if there are no results, go to lower gram sizes + for (var gramSize = this.gramSizeUpper; gramSize >= this.gramSizeLower; --gramSize) { + results = this.__get(value, gramSize, minMatchScore); + if (results && results.length > 0) { + return results; + } + } + return null; + }; + + fuzzyset.__get = function(value, gramSize, minMatchScore) { + var normalizedValue = this._normalizeStr(value), + matches = {}, + gramCounts = _gramCounter(normalizedValue, gramSize), + items = this.items[gramSize], + sumOfSquareGramCounts = 0, + gram, + gramCount, + i, + index, + otherGramCount; + + for (gram in gramCounts) { + gramCount = gramCounts[gram]; + sumOfSquareGramCounts += Math.pow(gramCount, 2); + if (gram in this.matchDict) { + for (i = 0; i < this.matchDict[gram].length; ++i) { + index = this.matchDict[gram][i][0]; + otherGramCount = this.matchDict[gram][i][1]; + if (index in matches) { + matches[index] += gramCount * otherGramCount; + } else { + matches[index] = gramCount * otherGramCount; + } + } + } + } + + function isEmptyObject(obj) { + for(var prop in obj) { + if(obj.hasOwnProperty(prop)) + return false; + } + return true; + } + + if (isEmptyObject(matches)) { + return null; + } + + var vectorNormal = Math.sqrt(sumOfSquareGramCounts), + results = [], + matchScore; + // build a results list of [score, str] + for (var matchIndex in matches) { + matchScore = matches[matchIndex]; + results.push([matchScore / (vectorNormal * items[matchIndex][0]), items[matchIndex][1]]); + } + var sortDescending = function(a, b) { + if (a[0] < b[0]) { + return 1; + } else if (a[0] > b[0]) { + return -1; + } else { + return 0; + } + }; + results.sort(sortDescending); + if (this.useLevenshtein) { + var newResults = [], + endIndex = Math.min(50, results.length); + // truncate somewhat arbitrarily to 50 + for (var i = 0; i < endIndex; ++i) { + newResults.push([_distance(results[i][1], normalizedValue), results[i][1]]); + } + results = newResults; + results.sort(sortDescending); + } + newResults = []; + results.forEach(function(scoreWordPair) { + if (scoreWordPair[0] >= minMatchScore) { + newResults.push([scoreWordPair[0], this.exactSet[scoreWordPair[1]]]); + } + }.bind(this)); + return newResults; + }; + + fuzzyset.add = function(value) { + var normalizedValue = this._normalizeStr(value); + if (normalizedValue in this.exactSet) { + return false; + } + + var i = this.gramSizeLower; + for (i; i < this.gramSizeUpper + 1; ++i) { + this._add(value, i); + } + }; + + fuzzyset._add = function(value, gramSize) { + var normalizedValue = this._normalizeStr(value), + items = this.items[gramSize] || [], + index = items.length; + + items.push(0); + var gramCounts = _gramCounter(normalizedValue, gramSize), + sumOfSquareGramCounts = 0, + gram, gramCount; + for (gram in gramCounts) { + gramCount = gramCounts[gram]; + sumOfSquareGramCounts += Math.pow(gramCount, 2); + if (gram in this.matchDict) { + this.matchDict[gram].push([index, gramCount]); + } else { + this.matchDict[gram] = [[index, gramCount]]; + } + } + var vectorNormal = Math.sqrt(sumOfSquareGramCounts); + items[index] = [vectorNormal, normalizedValue]; + this.items[gramSize] = items; + this.exactSet[normalizedValue] = value; + }; + + fuzzyset._normalizeStr = function(str) { + if (Object.prototype.toString.call(str) !== '[object String]') throw 'Must use a string as argument to FuzzySet functions'; + return str.toLowerCase(); + }; + + // return length of items in set + fuzzyset.length = function() { + var count = 0, + prop; + for (prop in this.exactSet) { + if (this.exactSet.hasOwnProperty(prop)) { + count += 1; + } + } + return count; + }; + + // return is set is empty + fuzzyset.isEmpty = function() { + for (var prop in this.exactSet) { + if (this.exactSet.hasOwnProperty(prop)) { + return false; + } + } + return true; + }; + + // return list of values loaded into set + fuzzyset.values = function() { + var values = [], + prop; + for (prop in this.exactSet) { + if (this.exactSet.hasOwnProperty(prop)) { + values.push(this.exactSet[prop]); + } + } + return values; + }; + + + // initialization + var i = fuzzyset.gramSizeLower; + for (i; i < fuzzyset.gramSizeUpper + 1; ++i) { + fuzzyset.items[i] = []; + } + // add all the items to the set + for (i = 0; i < arr.length; ++i) { + fuzzyset.add(arr[i]); + } + + return fuzzyset; + }; + + return FuzzySet; + +}()); diff --git a/src/manifest.json b/src/manifest.json index ba9b547..87300bd 100644 --- a/src/manifest.json +++ b/src/manifest.json @@ -23,7 +23,7 @@ "content_scripts": [ { "matches": ["*://1337x.to/*"], - "js": ["content-script.js"] + "js": ["libraries/fuzzyset.js", "content-script.js"] } ], "host_permissions": ["https://www.imdb.com/*"]