Skip to content

Commit

Permalink
Generate a static index.html for the version download archives
Browse files Browse the repository at this point in the history
[As suggested by @Turbo87][zulip], we should really have an index here
so people can easily enumerate the version download archives for
whatever nefarious purposes they have.

This will probably need some S3-side configuration as well to serve up
the index.html on demand.

I've set this up as a separate job from `ArchiveVersionDownloads` just
in case we ever need to run it separately, but I don't feel very
strongly about that. It could just as easily be a few new functions in
that module and run as part of that job, instead of being triggered by
that job.

This requires #9206.

[zulip]: https://rust-lang.zulipchat.com/#narrow/stream/318791-t-crates-io/topic/archiving.20.60version_downloads.60.20to.20S3/near/455657391
  • Loading branch information
LawnGnome committed Aug 1, 2024
1 parent e6bdfa2 commit d0b053b
Show file tree
Hide file tree
Showing 9 changed files with 364 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/admin/enqueue_job.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub enum Command {
/// The date before which to archive version downloads (default: 90 days ago)
before: Option<NaiveDate>,
},
IndexVersionDownloadsArchive,
UpdateDownloads,
CleanProcessedLogFiles,
DumpDb,
Expand Down Expand Up @@ -54,6 +55,9 @@ pub fn run(command: Command) -> Result<()> {
.unwrap_or_default()
.enqueue(conn)?;
}
Command::IndexVersionDownloadsArchive => {
jobs::IndexVersionDownloadsArchive.enqueue(conn)?;

Check warning on line 59 in src/admin/enqueue_job.rs

View check run for this annotation

Codecov / codecov/patch

src/admin/enqueue_job.rs#L59

Added line #L59 was not covered by tests
}
Command::UpdateDownloads => {
let count: i64 = background_jobs::table
.filter(background_jobs::job_type.eq(jobs::UpdateDownloads::JOB_NAME))
Expand Down
4 changes: 4 additions & 0 deletions src/worker/jobs/archive_version_downloads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ impl BackgroundJob for ArchiveVersionDownloads {
let uploaded_dates = upload(downloads_archive_store, tempdir.path(), dates).await?;
delete(&env.deadpool, uploaded_dates).await?;

// Queue up the job to regenerate the archive index.
let conn: &mut AsyncConnectionWrapper<_> = &mut env.deadpool.get().await?.into();
super::IndexVersionDownloadsArchive.enqueue(conn)?;

Check warning on line 72 in src/worker/jobs/archive_version_downloads.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/archive_version_downloads.rs#L71-L72

Added lines #L71 - L72 were not covered by tests

info!("Finished archiving old version downloads");
Ok(())
}
Expand Down
21 changes: 21 additions & 0 deletions src/worker/jobs/index_version_downloads_archive/index.html.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>crates.io version download archives</title>
</head>
<body>
<h1>crates.io version download archives</h1>

<table>
<tbody>
{% for file in files %}
<tr>
<td><a href="{{ file.name }}">{{ file.name }}</a></td>
<td>{{ file.size }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</body>
</html>
232 changes: 232 additions & 0 deletions src/worker/jobs/index_version_downloads_archive/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
use std::{collections::BTreeSet, sync::Arc};

use anyhow::Context;
use crates_io_worker::BackgroundJob;
use futures_util::TryStreamExt;
use object_store::{ObjectMeta, ObjectStore};

use crate::worker::Environment;

/// Path handed to the CDN invalidation step in `run` once the index has been regenerated.
///
/// NOTE(review): `generate` uploads the page as `index.html` relative to the store it is given,
/// so this constant presumably assumes that store is rooted at `archive/version-downloads`.
/// Confirm against the store configuration.
const INDEX_PATH: &str = "archive/version-downloads/index.html";

/// Generate an index.html for the version download CSVs exported to S3.
#[derive(Serialize, Deserialize, Default)]

Check warning on line 13 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L13

Added line #L13 was not covered by tests
pub struct IndexVersionDownloadsArchive;

impl BackgroundJob for IndexVersionDownloadsArchive {
const JOB_NAME: &'static str = "index_version_downloads_archive";

type Context = Arc<Environment>;

async fn run(&self, env: Self::Context) -> anyhow::Result<()> {
info!("Indexing old version downloads…");

Check warning on line 22 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L21-L22

Added lines #L21 - L22 were not covered by tests

let Some(downloads_archive_store) = env.downloads_archive_store.as_ref() else {
warn!("No downloads archive store configured");
return Ok(());

Check warning on line 26 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L24-L26

Added lines #L24 - L26 were not covered by tests
};

info!("Generating and uploading index.html…");
if let Err(error) = generate(downloads_archive_store).await {
warn!("Error generating index.html: {error}");
return Err(error);
}
info!("index.html generated and uploaded");

Check warning on line 34 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L29-L34

Added lines #L29 - L34 were not covered by tests

info!("Invalidating CDN caches…");
if let Err(error) = env.invalidate_cdns(INDEX_PATH).await {
warn!("Failed to invalidate CDN caches: {error}");
}
info!("CDN caches invalidated");

Check warning on line 40 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L36-L40

Added lines #L36 - L40 were not covered by tests

info!("Finished indexing old version downloads");
Ok(())
}

Check warning on line 44 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L42-L44

Added lines #L42 - L44 were not covered by tests
}

/// Generate and upload an index.html based on the objects within the given store.
///
/// The store is listed to collect the CSV files, the bundled template is rendered with them, and
/// the resulting page is written back to the same store under the key `index.html`, replacing any
/// object already stored there.
///
/// # Errors
///
/// Fails if the template cannot be loaded, the store cannot be listed, rendering fails, or the
/// upload fails. Each failure is wrapped with context naming the step that went wrong.
async fn generate(store: &impl ObjectStore) -> anyhow::Result<()> {
    let mut context = TemplateContext::new().context("instantiating TemplateContext")?;
    context
        .build_from_store(store)
        .await
        .context("building up files from object store")?;
    let index = context.into_string().context("rendering template")?;

    store
        .put(&"index.html".into(), index.into())
        .await
        .context("uploading index.html")?;

    Ok(())
}

/// Everything needed to render the archive index: the template environment and the set of files
/// collected from the object store.
struct TemplateContext {
// Template environment; `new` registers the bundled `index.html.j2` in it as `index.html`.
env: minijinja::Environment<'static>,
// Files to list in the index. `File`'s ordering is reversed by name, so iterating this set
// yields the greatest name first.
files: BTreeSet<File>,
}

impl TemplateContext {
fn new() -> anyhow::Result<Self> {
use minijinja::Environment;

let mut env = Environment::new();
env.add_template("index.html", include_str!("index.html.j2"))?;

Ok(Self {
env,
files: BTreeSet::new(),
})
}

async fn build_from_store(&mut self, store: &impl ObjectStore) -> anyhow::Result<()> {
let mut contents = store.list(None);
while let Some(object) = contents.try_next().await? {
match File::try_from(object) {
Ok(file) => {
self.files.insert(file);
}
Err(e) => {
warn!(?e, "ignoring file in object store");
}
}
}

Ok(())
}

fn into_string(self) -> anyhow::Result<String> {
use minijinja::context;

Ok(self.env.get_template("index.html")?.render(context! {
files => self.files,
})?)
}
}

/// One entry in the rendered index.
///
/// Equality and ordering are implemented by hand below and look at `name` only; `size` takes no
/// part in either.
#[derive(Serialize, Debug, Eq)]
struct File {
// File name of the object, as returned by `location.filename()` in the `TryFrom` impl.
name: String,
// Size copied from the object's metadata (the test fixtures use the byte length of the payload).
size: usize,
}

impl Ord for File {
    /// Order files by name, descending.
    ///
    /// The natural string ordering is flipped on purpose so that the most recent file comes first
    /// when the index is rendered.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.name.cmp(&other.name).reverse()
    }
}

impl PartialEq for File {
fn eq(&self, other: &Self) -> bool {
self.name == other.name
}

Check warning on line 124 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L122-L124

Added lines #L122 - L124 were not covered by tests
}

impl PartialOrd for File {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}

Check warning on line 130 in src/worker/jobs/index_version_downloads_archive/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/worker/jobs/index_version_downloads_archive/mod.rs#L128-L130

Added lines #L128 - L130 were not covered by tests
}

impl TryFrom<ObjectMeta> for File {
    type Error = anyhow::Error;

    /// Convert object store metadata into an index entry.
    ///
    /// Only objects whose filename ends in `.csv` are accepted. The entry keeps just the filename
    /// returned by `location.filename()` together with the object's size.
    ///
    /// # Errors
    ///
    /// Returns an error when the object has a filename that does not end in `.csv`, or when no
    /// filename can be obtained from its location.
    fn try_from(object: ObjectMeta) -> Result<Self, Self::Error> {
        match object.location.filename() {
            Some(filename) if filename.ends_with(".csv") => Ok(Self {
                name: filename.to_string(),
                size: object.size,
            }),
            Some(filename) => Err(anyhow::anyhow!("ignoring non-CSV file: {filename}")),
            None => Err(anyhow::anyhow!(
                "cannot get filename for object: {object:?}"
            )),
        }
    }
}

// Tests for index generation, run against an in-memory object store.
//
// The test function names and the expressions passed to `assert_snapshot!` are recorded in the
// stored snapshots, so keep them stable when editing.
#[cfg(test)]
mod tests {
use googletest::prelude::*;
use insta::assert_snapshot;
use object_store::memory::InMemory;

use super::*;

/// Runs `generate` against the fake store and checks that the placeholder `index.html` was
/// replaced and that the uploaded page matches the snapshot.
#[tokio::test]
async fn test_generate() -> anyhow::Result<()> {
let store = fake_store().await?;
generate(&store).await?;

let index = store.get(&"index.html".into()).await?.bytes().await?;

// This should have overwritten the previous file of just null bytes.
assert_that!(index.iter().any(|b| *b != b'\0'), eq(true));

assert_snapshot!(std::str::from_utf8(&index)?);

Ok(())
}

/// Runs `generate` against an empty store; an index with no rows must still be uploaded.
#[tokio::test]
async fn test_generate_empty() -> anyhow::Result<()> {
let store = InMemory::new();
generate(&store).await?;

let index = store.get(&"index.html".into()).await?.bytes().await?;
assert_snapshot!(std::str::from_utf8(&index)?);

Ok(())
}

/// Checks the intermediate `TemplateContext`: which files are collected, in what order, and what
/// the rendered output looks like.
#[tokio::test]
async fn test_template_context() -> anyhow::Result<()> {
let store = fake_store().await?;

let mut context = TemplateContext::new()?;
context.build_from_store(&store).await?;

// Validate that only the expected date CSVs are present, in order.
let filenames: Vec<_> = context
.files
.iter()
.map(|file| file.name.as_str())
.collect();

assert_that!(
filenames,
container_eq([
"2024-08-01.csv",
"2024-07-31.csv",
"2024-07-30.csv",
"2024-07-29.csv"
]),
);

assert_snapshot!(context.into_string()?);

Ok(())
}

/// Builds an in-memory store holding four date-named CSVs plus two entries the indexer is
/// expected to skip. Every object consists of `size` zero bytes.
async fn fake_store() -> anyhow::Result<InMemory> {
let store = InMemory::new();

for (name, size) in [
// Firstly, here are some plausible fake entries in random order.
("2024-07-31.csv", 123),
("2024-07-30.csv", 124),
("2024-08-01.csv", 138),
("2024-07-29.csv", 234),
// Now for the junk that we want to ignore. Let's put in an index.
("index.html", 40),
// And a nested file that isn't CSV at all.
("foo/bar", 50),
] {
store.put(&name.into(), vec![0u8; size].into()).await?;
}

Ok(store)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
source: src/worker/jobs/index_version_downloads_archive/mod.rs
expression: "std::str::from_utf8(&index)?"
---
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>crates.io version download archives</title>
</head>
<body>
<h1>crates.io version download archives</h1>

<table>
<tbody>

<tr>
<td><a href="2024-08-01.csv">2024-08-01.csv</a></td>
<td>138</td>
</tr>

<tr>
<td><a href="2024-07-31.csv">2024-07-31.csv</a></td>
<td>123</td>
</tr>

<tr>
<td><a href="2024-07-30.csv">2024-07-30.csv</a></td>
<td>124</td>
</tr>

<tr>
<td><a href="2024-07-29.csv">2024-07-29.csv</a></td>
<td>234</td>
</tr>

</tbody>
</table>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
source: src/worker/jobs/index_version_downloads_archive/mod.rs
expression: "std::str::from_utf8(&index)?"
---
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>crates.io version download archives</title>
</head>
<body>
<h1>crates.io version download archives</h1>

<table>
<tbody>

</tbody>
</table>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
source: src/worker/jobs/index_version_downloads_archive/mod.rs
expression: context.into_string()?
---
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>crates.io version download archives</title>
</head>
<body>
<h1>crates.io version download archives</h1>

<table>
<tbody>

<tr>
<td><a href="2024-08-01.csv">2024-08-01.csv</a></td>
<td>138</td>
</tr>

<tr>
<td><a href="2024-07-31.csv">2024-07-31.csv</a></td>
<td>123</td>
</tr>

<tr>
<td><a href="2024-07-30.csv">2024-07-30.csv</a></td>
<td>124</td>
</tr>

<tr>
<td><a href="2024-07-29.csv">2024-07-29.csv</a></td>
<td>234</td>
</tr>

</tbody>
</table>
</body>
</html>
Loading

0 comments on commit d0b053b

Please sign in to comment.