From c9c5db86ede5cfc769fd486e35004294f00cc7a3 Mon Sep 17 00:00:00 2001 From: sophiamersmann Date: Thu, 22 Feb 2024 15:27:06 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=90=9B=20fix=20script=20to=20refresh?= =?UTF-8?q?=20pageviews?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/refreshPageviewsFromDatasette.ts | 35 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts index 745a436a5af..23ce0998ac9 100644 --- a/db/refreshPageviewsFromDatasette.ts +++ b/db/refreshPageviewsFromDatasette.ts @@ -2,6 +2,19 @@ import fetch from "node-fetch" import Papa from "papaparse" import * as db from "./db.js" +import { DbPlainAnalyticsPageview } from "@ourworldindata/types" +import { omitUndefinedValues } from "@ourworldindata/utils" + +const analyticsPageviewsColumnNames: Array = [ + "day", + "url", + "views_7d", + "views_14d", + "views_365d", +] + +const emojiRegex = + /[\u{1f300}-\u{1f5ff}\u{1f900}-\u{1f9ff}\u{1f600}-\u{1f64f}\u{1f680}-\u{1f6ff}\u{2600}-\u{26ff}\u{2700}-\u{27bf}\u{1f1e6}-\u{1f1ff}\u{1f191}-\u{1f251}\u{1f004}\u{1f0cf}\u{1f170}-\u{1f171}\u{1f17e}-\u{1f17f}\u{1f18e}\u{3030}\u{2b50}\u{2b55}\u{2934}-\u{2935}\u{2b05}-\u{2b07}\u{2b1b}-\u{2b1c}\u{3297}\u{3299}\u{303d}\u{00a9}\u{00ae}\u{2122}\u{23f3}\u{24c2}\u{23e9}-\u{23ef}\u{25b6}\u{23f8}-\u{23fa}]/gu async function downloadAndInsertCSV(): Promise { // Fetch CSV from private Datasette and insert it to a local MySQL. This function @@ -18,7 +31,7 @@ async function downloadAndInsertCSV(): Promise { } const csvText = await response.text() - const parsedData = Papa.parse(csvText, { + const parsedData = Papa.parse>(csvText, { header: true, }) @@ -27,12 +40,24 @@ async function downloadAndInsertCSV(): Promise { return } - const onlyValidRows = [...parsedData.data].filter( - (row) => Object.keys(row as any).length === 5 - ) as any[] + const table = [...parsedData.data].map((parsedRow) => { + const row: Partial = {} + for (const key of analyticsPageviewsColumnNames) { + row[key] = parsedRow[key] + } + return omitUndefinedValues(row) + }) + + const onlyValidRows = table.filter( + (row) => + row.day !== undefined && + row.url !== undefined && + // MySQL complains about emoji characters, so we filter them out + !row.url.match(emojiRegex) + ) console.log("Parsed CSV data:", onlyValidRows.length, "rows") - console.log("Columns:", parsedData.meta.fields) + console.log("Columns:", analyticsPageviewsColumnNames.join(", ")) // TODO: this instance should be handed down as a parameter const knex = db.knexInstance() From e3ddf6302a6fe55e95984510096c3ea760764a80 Mon Sep 17 00:00:00 2001 From: sophiamersmann Date: Thu, 22 Feb 2024 15:33:06 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=92=84=20pass=20knex=20down=20as=20pa?= =?UTF-8?q?rameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/refreshPageviewsFromDatasette.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts index 23ce0998ac9..d561b1903f1 100644 --- a/db/refreshPageviewsFromDatasette.ts +++ b/db/refreshPageviewsFromDatasette.ts @@ -4,6 +4,7 @@ import Papa from "papaparse" import * as db from "./db.js" import { DbPlainAnalyticsPageview } from "@ourworldindata/types" import { omitUndefinedValues } from "@ourworldindata/utils" +import { Knex } from "knex" const analyticsPageviewsColumnNames: Array = [ "day", @@ -16,7 +17,7 @@ const analyticsPageviewsColumnNames: Array = [ const emojiRegex = /[\u{1f300}-\u{1f5ff}\u{1f900}-\u{1f9ff}\u{1f600}-\u{1f64f}\u{1f680}-\u{1f6ff}\u{2600}-\u{26ff}\u{2700}-\u{27bf}\u{1f1e6}-\u{1f1ff}\u{1f191}-\u{1f251}\u{1f004}\u{1f0cf}\u{1f170}-\u{1f171}\u{1f17e}-\u{1f17f}\u{1f18e}\u{3030}\u{2b50}\u{2b55}\u{2934}-\u{2935}\u{2b05}-\u{2b07}\u{2b1b}-\u{2b1c}\u{3297}\u{3299}\u{303d}\u{00a9}\u{00ae}\u{2122}\u{23f3}\u{24c2}\u{23e9}-\u{23ef}\u{25b6}\u{23f8}-\u{23fa}]/gu -async function downloadAndInsertCSV(): Promise { +async function downloadAndInsertCSV(knex: Knex): Promise { // Fetch CSV from private Datasette and insert it to a local MySQL. This function // exists because `make refresh` uses MySQL dump that excludes analytics_pageviews // table. That's why it's necessary to call `make refresh.pageviews` separately. @@ -59,9 +60,6 @@ async function downloadAndInsertCSV(): Promise { console.log("Parsed CSV data:", onlyValidRows.length, "rows") console.log("Columns:", analyticsPageviewsColumnNames.join(", ")) - // TODO: this instance should be handed down as a parameter - const knex = db.knexInstance() - await knex.transaction(async (trx) => { await db.knexRaw("TRUNCATE TABLE analytics_pageviews", trx) @@ -72,7 +70,8 @@ async function downloadAndInsertCSV(): Promise { const main = async (): Promise => { try { - await downloadAndInsertCSV() + const knex = db.knexInstance() + await downloadAndInsertCSV(knex) } catch (e) { console.error(e) } finally { From 6f299a5e94999ae5f3c4ad0ac171966334b9e962 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Tue, 27 Feb 2024 15:08:13 +0100 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=94=A8=20move=20truncate=20out=20of?= =?UTF-8?q?=20transaction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/refreshPageviewsFromDatasette.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts index d561b1903f1..89d9e185e88 100644 --- a/db/refreshPageviewsFromDatasette.ts +++ b/db/refreshPageviewsFromDatasette.ts @@ -60,9 +60,8 @@ async function downloadAndInsertCSV(knex: Knex): Promise { console.log("Parsed CSV data:", onlyValidRows.length, "rows") console.log("Columns:", analyticsPageviewsColumnNames.join(", ")) + await db.knexRaw("TRUNCATE TABLE analytics_pageviews", knex) await knex.transaction(async (trx) => { - await db.knexRaw("TRUNCATE TABLE analytics_pageviews", trx) - await trx.batchInsert("analytics_pageviews", onlyValidRows) }) console.log("CSV data inserted successfully!")