Skip to content

Commit

Permalink
Improve hydration hits and output summary after run
Browse files Browse the repository at this point in the history
  • Loading branch information
alistairjcbrown committed Nov 1, 2024
1 parent 28f7eec commit 079e9b3
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 52 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/generate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ jobs:
- run: npm run generate electriccinema.co.uk-portobello
- run: npm run generate electriccinema.co.uk-white-city

# Run post-data scripts
- run: npm run output:highlight-hydration-misses-for-review

# Create release with assets
- uses: ncipollo/release-action@v1
id: release
Expand Down
7 changes: 5 additions & 2 deletions common/bfi.org.uk/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ function getOverviewFor({ html }) {
.text()
.trim()
.toLowerCase();
const content = $(this).find(".Film-info__information__value").text();
const content = $(this)
.find(".Film-info__information__value")
.text()
.trim();

if (heading === "director" && overview.directors.length === 0) {
overview.directors = splitConjoinedItemsInList(convertToList(content));
Expand All @@ -31,7 +34,7 @@ function getOverviewFor({ html }) {
} else if (heading === "certificate" && !overview.certification) {
overview.certification = content;
} else {
const hasTimings = content.match(/\s+(\d{4}).\s+(\d+)min$/i);
const hasTimings = content.match(/\s+(\d{4}).\s+(\d+)min(?:\s|$)/i);
if (hasTimings && !overview.year) {
overview.year = hasTimings[1];
}
Expand Down
52 changes: 4 additions & 48 deletions common/hydrate.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
const slugify = require("slugify");
const { MovieDb } = require("moviedb-promise");
const { dailyCache } = require("./cache");
const knownRemovablePhrases = require("./known-removable-phrases.json");
const { parseMinsToMs } = require("./utils");
const normalizeTitle = require("./normalize-title");
require("dotenv").config();

const moviedb = new MovieDb(process.env.MOVIEDB_API_KEY);
Expand All @@ -29,51 +29,6 @@ const getMovieInfoAndCacheResults = ({ id }) =>
return moviedb.movieInfo(payload);
});

/**
 * Reduces a listing title to the bare film title.
 *
 * The title is lower-cased, then stripped (in this order) of: anything up to
 * and including "presents", anything from "presented" onwards, anything after
 * a " +" / " -" separator, square-bracket suffixes (a "[YYYY]" is first turned
 * into "(YYYY)"), every entry of knownRemovablePhrases, and up to two trailing
 * parenthesised groups unless the title ends in a "(YYYY)" year.
 *
 * @param {string} title - Raw listing title.
 * @returns {string} Lower-cased, trimmed title with colon spacing tidied.
 */
function normalize(title) {
  let working = title.toLowerCase();

  // "<host> presents: <film>" - keep only what follows.
  const presentsMatch = working.match(/\s+presents:?\s+(.*?)$/i);
  if (presentsMatch) working = presentsMatch[1];

  // "<film> presented by <host>" - keep only what precedes.
  const presentedMatch = working.match(/^(.*?)\s+presented\s+/i);
  if (presentedMatch) working = presentedMatch[1];

  // "<film> + Q&A" or "<film> - extra" - keep only the leading part.
  const separatorMatch = working.match(/^(.*?)\s+(?:\+|\-)\s*/);
  if (separatorMatch) working = separatorMatch[1];

  // Rewrite "[1979]" as "(1979)" so it survives the bracket stripping below.
  const bracketYearMatch = working.trim().match(/^(.*?)\[(\d{4})\](.*?)$/);
  if (bracketYearMatch) {
    const [, before, year, after] = bracketYearMatch;
    working = `${before}(${year})${after}`;
  }

  // Any other square-bracket suffix is dropped along with what follows it.
  const bracketMatch = working.match(/^(.*?)\s+\[/);
  if (bracketMatch) working = bracketMatch[1];

  // Only the first occurrence of each phrase is removed.
  for (const phrase of knownRemovablePhrases) {
    working = working.replace(phrase.toLowerCase(), "");
  }

  const endsWithYear = /\(\d{4}\)$/.test(working.trim());
  if (!endsWithYear) {
    // Two passes, in case there is more than one trailing parenthesis group.
    for (let pass = 0; pass < 2; pass += 1) {
      working = working.replace(/\([^(]*\)$/, "").trim();
    }
  }

  const tidiedColons = working.replace(/\s*:\s+/g, ": ").trim();
  return tidiedColons.replace(/:$/, "").trim();
}

const getMovieTitleAndYearFrom = (title) => {
const hasYear = title.trim().match(/^(.*?)\s*\((\d{4})\)$/);
if (hasYear)
Expand All @@ -90,8 +45,9 @@ function getBestMatch(titleQuery, results) {

// If there's only one match that has the same title, then pick it
const matches = results.filter(
({ title }) => title.toLowerCase() === titleQuery.toLowerCase(),
({ title }) => normalizeTitle(title) === titleQuery.toLowerCase(),
);

if (matches.length === 1) return matches[0];

// Otherwise if there's a bunch which match the title, pick the most popular
Expand All @@ -107,7 +63,7 @@ function getBestMatch(titleQuery, results) {
async function hydrate(shows) {
return await Promise.all(
shows.map(async (show) => {
const title = normalize(show.title);
const title = normalizeTitle(show.title);
const { title: normalizedTitle, year } = getMovieTitleAndYearFrom(title);
const slug = slugify(normalizedTitle, { strict: true }).toLowerCase();
const search = await searchMovieAndCacheResults({
Expand Down
13 changes: 12 additions & 1 deletion common/known-removable-phrases.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
": Final Cut",
"Theatrical Cut",
"Director's Cut",
"Extended Cut",
"40th Anniversar",
"PINK PALACE:",
"Classic Matinee:",
Expand Down Expand Up @@ -79,10 +80,20 @@
"Philosophical Screens:",
"Sunday Premiere:",
"Horror with Hitchcock:",
"Dog-Friendly Screening:",
"Dog-Friendly Screening",
"Green Screen:",
"SLA:",
"/ Broken Social Scene",
"| The Ritzy",
"FILM CLUB:"
"FILM CLUB:",
"Secret Ceremony special screening:",
"pitchblack playback:",
": stu spasm",
": black & neurodiverse movie club launch",
"Closing Night:",
"MADE IN PRAGUE:",
": Love in the Age of Anarchy",
": Reverie Cineclub: Fairytales from around the World",
": Reverie Cineclub"
]
60 changes: 60 additions & 0 deletions common/normalize-title.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
const knownRemovablePhrases = require("./known-removable-phrases.json");

/**
 * Reduces a cinema listing title to the bare film title.
 *
 * The title is lower-cased, then stripped (in this order) of: known series
 * prefixes, anything up to and including "present(s)", anything from
 * "presented" onwards, anything after a " +" / " -" separator, square-bracket
 * suffixes (a "[YYYY]" is first turned into "(YYYY)"), every entry of
 * knownRemovablePhrases, and up to two trailing parenthesised groups unless
 * the title ends in a "(YYYY)" year. Finally colon spacing is tidied,
 * apostrophes and free-standing dashes are removed and whitespace collapsed.
 *
 * @param {string} title - Raw listing title.
 * @returns {string} Lower-cased, trimmed title; a trailing "(YYYY)" is kept.
 */
function normalizeTitle(title) {
  let normalized = title.toLowerCase();

  const removablePrefixes = [
    "Scared To Dance -",
    "Hitchcock: The Gainsborough Days -",
  ];

  // Must run before the separator step below, which would otherwise keep the
  // series name (the text before " -") and throw the film title away.
  removablePrefixes.forEach((phrase) => {
    normalized = normalized.replace(phrase.toLowerCase(), "");
  });

  // "<host> presents: <film>" - keep only what follows.
  const hasPresents = normalized.match(/\s+presents?:?\s+(.*?)$/i);
  if (hasPresents) {
    normalized = hasPresents[1];
  }

  // "<film> presented by <host>" - keep only what precedes.
  const hasPresented = normalized.match(/^(.*?)\s+presented\s+/i);
  if (hasPresented) {
    normalized = hasPresented[1];
  }

  // "<film> + Q&A" or "<film> - extra" - keep only the leading part.
  const hasSeparator = normalized.match(/^(.*?)\s+(?:\+|\-)\s*/);
  if (hasSeparator) {
    normalized = hasSeparator[1];
  }

  // Rewrite "[1979]" as "(1979)" so it survives the bracket stripping below.
  const hasSquareBracketDate = normalized
    .trim()
    .match(/^(.*?)\[(\d{4})\](.*?)$/);
  if (hasSquareBracketDate) {
    normalized = `${hasSquareBracketDate[1]}(${hasSquareBracketDate[2]})${hasSquareBracketDate[3]}`;
  }

  // Any other square-bracket suffix is dropped along with what follows it.
  const hasBrackets = normalized.match(/^(.*?)\s+\[/);
  if (hasBrackets) {
    normalized = hasBrackets[1];
  }

  // Only the first occurrence of each phrase is removed.
  knownRemovablePhrases.forEach((phrase) => {
    normalized = normalized.replace(phrase.toLowerCase(), "");
  });

  const hasYear = normalized.trim().match(/\(\d{4}\)$/);
  if (!hasYear) {
    normalized = normalized.replace(/\([^(]*\)$/, "").trim();
    // Do it twice in case there is more than one trailing parenthesis group.
    normalized = normalized.replace(/\([^(]*\)$/, "").trim();
  }

  return (
    normalized
      .replace(/\s*:\s+/g, ": ")
      .trim()
      .replace(/:$/, "")
      // Straight and typographic (U+2019) apostrophes. Written as an escape so
      // the non-ASCII character cannot be lost by tooling.
      .replace(/['\u2019]/g, "")
      // A free-standing hyphen, en dash (U+2013) or em dash (U+2014).
      .replace(/\s+[-\u2013\u2014](\s|$)/g, " ")
      .replace(/\s+/g, " ")
      .trim()
  );
}

module.exports = normalizeTitle;
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"scripts": {
"test": "jest",
"format": "prettier \"./**/*.js(on)?\" --write && prettier \"./**/*.md\" --write --prose-wrap=always",
"generate": "TZ=Europe/London node index.js"
"generate": "TZ=Europe/London node index.js",
"output:highlight-hydration-misses-for-review": "node ./scripts/highlight-hydration-misses-for-review.js"
},
"author": "Alistair Brown <[email protected]>",
"license": "MIT",
Expand Down
8 changes: 8 additions & 0 deletions scripts/get-latest-release-assets.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
# Downloads every JSON asset attached to the latest GitHub release of the
# repository into ./output/.

REPO_URL='https://api.github.com/repos/alistairjcbrown/hackney-cinema-calendar/releases/latest'

# --fail makes curl exit non-zero on an HTTP error (e.g. rate limiting) so an
# error payload is never parsed as though it were a release listing.
if ! RESPONSE_LIST=$(curl --fail -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "$REPO_URL"); then
  echo "Failed to fetch latest release metadata from $REPO_URL" >&2
  exit 1
fi

# Each asset appears as a line of the form  "browser_download_url": "<url>"
# so the 4th double-quote-delimited field is the URL itself. Reading line by
# line avoids word-splitting and globbing of the URLs.
echo "$RESPONSE_LIST" | grep browser_download | grep json | cut -d\" -f4 |
while IFS= read -r asset_url
do
  wget "$asset_url" -P ./output/
done
107 changes: 107 additions & 0 deletions scripts/highlight-hydration-misses-for-review.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
const path = require("node:path");
const getSites = require("../common/get-sites");
const normalizeTitle = require("../common/normalize-title");

// Title patterns for listings that are not expected to have a movie match
// (multi-film events, talks, live recordings, music nights, ...). A show whose
// title matches any of these is left out of the review output below.
// All patterns are case-insensitive.
const termsExpectedToNotMatch = [
// Multiple films
/All\-Nighter/i,
/Marathon/i,
/Double Feature/i,
/Trilogy/i,
/Mystery Movie/i,

// Non feature film events
/Comedy Night/i,
/\s+Comedy$/i,
/Concert/i,
/Quiz/i,
/Filmmakers.*?Club/i,
/TV Preview:/i,
/short Film/i,
/ shorts: /i,
/^Behind the Scenes:/i,
/^Member Exclusive: .* Tour$/i,
/Session \d+/i,
/Season \d+/i,
/Programme \d+/i,
/Live in 3D/i,
/New Writings/i,
/Lecture:/i,
/Animation Workshop/i,
/Poetry Slam/i,
/ in conversation/i,
/Stunt Saturday/i,
/the art of /i,

// Community events
/Library Talk/i,
/Library Research Session/i,
/Womens Voices Forum/i,
/Free Talk:/i,
/Raising Awareness of/i,

// Film festival
/Opening Gala/i,
/Film Awards/i,

// Live recordings
/^NT Live:/i,
/^RBO[^:]*:/i,
/^The Royal Ballet:/i,
/^Met Opera[^:]*:/i,
/^The Metropolitan Opera:/i,
/^The Royal Opera:/i,
/^MACBETH:/i,
/EXHIBITION ON SCREEN:/i,
/^Play for Today:/i,
/^Performance:/i,

// Music
/at The Ritzy/i,
/Live Sessions/i,
/Pitchblack Playback/i,
/Dub Me Always:/i,
/Your Gospel Night/i,
/Funky Stuff/i,
/Vinyl Sisters/i,
];

/**
 * Decides whether a show should have been matched to a movie record.
 *
 * @param {{ title: string }} show - Only the title is inspected.
 * @returns {boolean} false when the lower-cased title matches any entry of
 *   termsExpectedToNotMatch, true otherwise.
 */
const expectedMatch = ({ title }) =>
  termsExpectedToNotMatch.every(
    (pattern) => title.toLowerCase().match(pattern) === null,
  );

// Load each site's generated "<site>-shows.json" from ../output, keyed by the
// site name returned from getSites().
const data = Object.fromEntries(
  getSites().map((site) => {
    const showsFile = path.join(__dirname, "..", "output", `${site}-shows.json`);
    return [site, require(showsFile)];
  }),
);

// Group every show that has no moviedb entry, but was expected to have one,
// by its original title.
//
// A Map is used rather than a plain object because titles are arbitrary
// scraped strings: on a plain object a title such as "constructor" would
// resolve to an inherited Object.prototype member (making `.push` throw), and
// integer-like titles such as "1917" would be listed ahead of all the others
// instead of in the order they were encountered.
const flaggedForReview = new Map();
Object.entries(data).forEach(([site, siteData]) => {
  siteData.forEach((show) => {
    if (show.moviedb || !expectedMatch(show)) return;
    if (!flaggedForReview.has(show.title)) {
      flaggedForReview.set(show.title, []);
    }
    flaggedForReview.get(show.title).push({ site, show });
  });
});

// Print one numbered entry per distinct title, using the first flagged show
// for the year and the source URL.
[...flaggedForReview].forEach(([key, matches], index) => {
  const normalizedTitle = normalizeTitle(key);
  const year = matches[0].show.overview.year;
  console.log(`${index + 1}. "${normalizedTitle}"${year ? ` (${year})` : ""}`);
  console.log(` - Original: "${key}"`);
  console.log(
    ` - Search for matches: https://www.themoviedb.org/search/movie?query=${encodeURIComponent(normalizedTitle)}`,
  );
  console.log(` - Source: ${matches[0].show.url}`);
  console.log(" ");
});

0 comments on commit 079e9b3

Please sign in to comment.