CLDR-17934 site: parse sitemap.tsv
srl295 committed Sep 11, 2024
1 parent 6e6d6f5 commit 0c73460
Showing 2 changed files with 140 additions and 85 deletions.
2 changes: 1 addition & 1 deletion docs/site/_layouts/page.html
@@ -17,7 +17,7 @@
</div>
</div>
<div class="message"><i>This navigation UI is temporary, just to give access to the pages.</i></div>
<div class="bar"><a href="/sitemap" class="bar">Site Map</a></div>
<!-- <div class="bar"><a href="/sitemap" class="bar">Site Map</a></div> -->
</header>


223 changes: 139 additions & 84 deletions docs/site/assets/js/build.mjs
@@ -1,16 +1,33 @@
// extract site frontmatter, save to json
// extract site frontmatter and read from /sitemap.tsv, save to json

import * as fs from "node:fs/promises";
import * as path from "node:path";
import { default as process } from "node:process";
import { default as matter } from "gray-matter";
import { SitemapStream, streamToPromise } from "sitemap";
import { Readable } from "node:stream";
import { Dirent } from "node:fs";

// utilities and constants

// files to skip
const SKIP_THESE = /(node_modules|\.jekyll-cache|^sitemap.*)/;

// final URL of site
const SITE = "https://cldr.unicode.org";

// input file
const SITEMAPFILE = "sitemap.tsv";

// utility collator
const coll = new Intl.Collator(["und"]);

/**
* Directory Crawler: process one directory
* @param {string} d parent directory
* @param {string} fullPath path to this file
* @param {object} out output object
*/
async function processFile(d, fullPath, out) {
const f = await fs.readFile(fullPath, "utf-8");
const m = matter(f);
@@ -22,7 +39,13 @@ async function processFile(d, fullPath, out) {
}
}

/** process one dirent */
/**
* Directory Crawler: process one dirent
* @param {string} d parent directory
* @param {object} out output object
* @param {Dirent} e directory entry
* @returns
*/
async function processEntry(d, out, e) {
const fullpath = path.join(d, e.name);
if (SKIP_THESE.test(e.name)) return;
@@ -35,6 +58,7 @@ async function processEntry(d, out, e) {
}

/**
* Directory Crawler: kick off the crawl (or subcrawl) of a directory
* @param {string} d path to directory
* @param {object} out output struct
*/
@@ -65,6 +89,11 @@ function dropmd(p) {
return p.replace(/\.md$/, "");
}

/**
* Build an indentation string.
* @param {number} n
* @returns string with n tabs
*/
function tabs(n) {
let s = [];
for (let i = 0; i < n; i++) {
@@ -73,60 +102,12 @@
return s.join("");
}

/** convert a markdown path to a final URL */
function mkurl(p) {
return `${SITE}/${md2html(p)}`;
}

const coll = new Intl.Collator(["und"]);

function writeSiteMapSheet({ all, allDirs }, path, outsheet) {
// write my index
function indexForPath(p) {
if (p === "") {
p = "index.md";
} else {
p = path2dir(p) + ".md";
}
return all.findIndex(({ fullPath }) => fullPath === p);
}
const myIndex = indexForPath(path);
if (myIndex === -1) {
throw Error(`Could not find index for ${path}`);
}
const { title, fullPath: indexPath } = all[myIndex];
// find out how much to indent.
// 'path' is '' or 'foo/' or 'foo/bar/baz/' at this point.
const slashes = path.replace(/[^\/]+/g, ""); // foo/bar/ => //
const indent = tabs(slashes.length); // number of slashes => number of tabs
outsheet.push(`${indent}${dropmd(indexPath)}`);

// now, gather the children.
const children = all.filter(({ fullPath }) => {
if (fullPath === indexPath) return false; // no self-list.
const myDir = path2dir(fullPath);
// would this item be under our dir?
if (`${myDir}.md` === indexPath) return true;
// special case for odd /index subdir
if (indexPath === `index.md` && myDir === "") return true;
return false;
});

children.sort((a, b) => coll.compare(a.title, b.title));

children.forEach(({ title, fullPath }) => {
// if an index, recurse instead.
const baseName = dropmd(fullPath); // downloads.md -> downloads
if (allDirs.has(baseName)) {
// it's a non-leaf node, recurse.
writeSiteMapSheet({ all, allDirs }, `${baseName}/`, outsheet);
} else {
// write leaf (non-index) child pages
outsheet.push(`${indent}\t${baseName}`);
}
});
}

async function writeSiteMaps(out) {
async function writeXmlSiteMap(out) {
// simple list of links
const links = await Promise.all(
out.all.map(async ({ fullPath, title }) => {
@@ -143,52 +124,126 @@
).toString();
await fs.writeFile("./sitemap.xml", data, "utf-8");
console.log(`Wrote sitemap.xml with ${links.length} entries`);
}

const allSorted = [...out.all].sort((a, b) =>
coll.compare(a.fullPath, b.fullPath)
);
await fs.writeFile(
"./sitemap.md",
`---\ntitle: Site Map\n---\n\n` +
allSorted
.map(
({ fullPath, title }) =>
`- [/${fullPath}](/${dropmd(fullPath)}) - ${title}`
)
.join("\n"),
"utf-8"
);
console.log("Wrote sitemap.md");

// now, create sitemap.tsv by walking
const outsheet = [];
const allPaths = out.all.map(({ fullPath }) => fullPath);
// Find all 'directories' (ending with /)
const allDirs = new Set();
allPaths.forEach((p) => {
const segs = p.split("/").slice(0, -1); // ['', 'dir1']
for (let n = 0; n <= segs.length; n++) {
// add all parent paths, so: '', dir1, dir1/dir2 etc.
const subpath = segs.slice(0, n).join("/");
allDirs.add(subpath);
async function readTsvSiteMap(out) {
console.log(`Reading ${SITEMAPFILE}`);
const lines = (await fs.readFile(SITEMAPFILE, "utf-8")).split("\n"); // don't skip comment lines here so we can get line numbers.
const errors = [];

// user's specified map
const usermap = {
/*
index: {
parent: null,
title: 'CLDR Site',
children: [
'cldr-spec',
'downloads',
],
},
'cldr-spec': {
parent: 'index',
title: …,
children: [
'cldr-spec/collation-guidelines',
],
},
'cldr-spec/collation-guidelines': {
parent: 'cldr-spec',
title: …,
children: null,
},
*/
};
// stack of parents, in order
let parents = [];
let n = 0;
for (let line of lines) {
n++;
const location = `${SITEMAPFILE}:${n}: `; // for errors
// skip comment or blank lines
if (/^[ \t]*#/.test(line) || !line.trim()) continue;

// # of leading tabs
const tabs = /^[\t]*/.exec(line)[0].length;
// rest of line: the actual path
const path = line.slice(tabs).trim();
if (usermap[path]) {
errors.push(`${location} duplicate path: ${path}`);
continue;
}
const foundItem = out.all.find(({ fullPath }) => fullPath === `${path}.md`);
if (!foundItem) {
errors.push(`${location} could not find file: ${path}.md`);
continue;
}
if (!foundItem.title) {
errors.push(`${location} missing title in ${path}.md`);
// let this continue on
}
usermap[path] = {
title: foundItem.title ?? path,
};
const parentCount = parents.length;
if (tabs < parentCount) {
/**
* index [1]
* foo [2]
*
*/
// outdent
if (tabs == 0) {
errors.push(`${location} can't have more than one root page!`);
break;
}
// drop 'n' parents
parents = parents.slice(0, tabs);
} else if (tabs > parentCount) {
// Error - wrong indent
errors.push(
`${location} indent too deep (expected ${parentCount} tabs at most)`
);
continue;
}
const parent = parents.slice(-1)[0] || null; // calculate parent (null for index page)
usermap[path].parent = parent;
if (parent) {
// not for index
usermap[parent].children = usermap[parent].children ?? [];
usermap[parent].children.push(path);
}
parents.push(path); // for next time
}
out.usermap = usermap;
out.all.forEach(({ fullPath }) => {
if (!usermap[dropmd(fullPath)]) {
errors.push(`${SITEMAPFILE}: missing: ${dropmd(fullPath)}`);
}
});

writeSiteMapSheet({ all: out.all, allDirs }, "", outsheet);
await fs.writeFile("./sitemap.tsv", outsheet.join("\n"), "utf-8");
console.log(`wrote sitemap.tsv with ${outsheet.length} entries`);
if (errors.length) {
errors.forEach((l) => console.error(l));
throw Error(`${errors.length} errors reading tsv`);
} else {
console.log(`${SITEMAPFILE} Valid.`);
}
}

/** top level async */
async function main() {
const out = {
all: [],
dirs: {},
};
await fs.mkdir("assets/json/", { recursive: true });
await traverse(".", out);
await writeXmlSiteMap(out);
await readTsvSiteMap(out);
// write final json asset
await fs.writeFile("assets/json/tree.json", JSON.stringify(out, null, " "));
console.log("Wrote assets/json/tree.json");
await writeSiteMaps(out);
}

main().then(

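For context, the new readTsvSiteMap() parses a tab-indented sitemap.tsv. A minimal sketch of what such a file could contain follows, reusing the page paths from the usermap example comment in the diff above (the real sitemap.tsv is not part of this commit):

# sitemap.tsv: one page path per line, nested under its parent by leading tabs
index
	cldr-spec
		cldr-spec/collation-guidelines
	downloads

Each listed path must have a matching <path>.md found by the directory crawl, every crawled page must appear somewhere in the file, only one zero-tab root line is allowed, and a line may be indented by at most one tab more than the line above it; violations are collected into the errors array and fail the build.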