Try GET requests instead of the S3 GetObject command
rayangler committed Apr 2, 2024
1 parent 0a9ccca commit fe2418a
Showing 3 changed files with 194 additions and 40 deletions.
61 changes: 52 additions & 9 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -45,6 +45,7 @@
"jest-mock-extended": "^2.0.2-beta2",
"js-yaml": "^3.13.1",
"mongodb": "^5.1.0",
"p-limit": "^5.0.0",
"simple-git": "^2.45.1",
"tsscmp": "^1.0.6",
"validator": "^10.11.0"
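For context, p-limit (the dependency added above) caps how many promises run at once: pLimit(n) returns a limiter, and each task wrapped in that limiter waits for a free slot before starting. A minimal sketch of the API as jobHandler.ts uses it below, with hypothetical URLs:

import pLimit from 'p-limit';
import axios from 'axios';

// At most 5 wrapped tasks run concurrently; the rest queue until a slot frees up.
const limit = pLimit(5);

// Hypothetical work items, purely for illustration.
const urls = ['https://example.com/a.json', 'https://example.com/b.json'];

const tasks = urls.map((url) => limit(() => axios.get(url)));
await Promise.all(tasks);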
172 changes: 141 additions & 31 deletions src/job/jobHandler.ts
@@ -1,6 +1,8 @@
import { GetObjectCommand, ListObjectsV2Command, S3Client } from '@aws-sdk/client-s3';
import { ListObjectsV2Command, S3Client } from '@aws-sdk/client-s3';
import axios, { AxiosResponse } from 'axios';
import path from 'path';
import pLimit from 'p-limit';
import fs from 'fs';
import { Payload, Job, JobStatus } from '../entities/job';
import { JobRepository } from '../repositories/jobRepository';
import { RepoBranchesRepository } from '../repositories/repoBranchesRepository';
@@ -18,7 +20,6 @@ import { MONOREPO_NAME } from '../monorepo/utils/monorepo-constants';
import { nextGenHtml, nextGenParse, oasPageBuild, persistenceModule, prepareBuild } from '../commands';
import { downloadBuildDependencies, writeFileAsync } from '../commands/src/helpers/dependency-helpers';
import { CliCommandResponse } from '../commands/src/helpers';
require('fs');

export abstract class JobHandler {
private _currJob: Job;
@@ -524,13 +525,36 @@ export abstract class JobHandler {
process.env.REGRESSION = regression;
}

// TODO-4442: Need to figure out how to split between cache and S3 files
private async downloadByUrl(objKey: string, destPath: string) {
const s3Url = `https://docs-mongodb-org-stg.s3.us-east-2.amazonaws.com/${objKey}`;
const maxAttempts = 3;

// Retry in case of random network issues
for (let i = maxAttempts; i > 0; i--) {
try {
const res = await axios.get(s3Url, { timeout: 10000, responseType: 'stream' });
const dirName = path.dirname(destPath);
this._fileSystemServices.createDirIfNotExists(dirName);
const dest = fs.createWriteStream(destPath);
res.data.pipe(dest);
// Wait for the write stream to finish flushing to disk before declaring success
await new Promise((resolve, reject) => {
dest.on('finish', resolve);
dest.on('error', reject);
});
console.log(`${objKey} is okay!`);
return;
} catch (err) {
console.error(`Failed fetching ${objKey}, retrying`);
const delay = 1000;
await new Promise((resolve) => setTimeout(resolve, delay));
}
}

console.error(`Failed to fetch ${objKey} after ${maxAttempts} attempts`);
}

private async downloadExistingArtifacts() {
await this._logger.save(this._currJob._id, 'Attempting to download existing artifacts');
const timerLabel = 'downloadExistingArtifacts - No limit';
console.time(timerLabel);

console.log('Attempting to download existing artifacts');
const client = new S3Client({ region: 'us-east-2' });
const bucket = process.env.BUCKET;
const bucket = 'docs-mongodb-org-stg';
if (!bucket) {
this._logger.error(this._currJob._id, `Missing bucket: ${bucket}`);
console.error(`Missing bucket: ${bucket}`);
return;
}

@@ -539,19 +563,20 @@
// We'll need to figure out how to handle differences between prod deploy vs. content staging build, if necessary
const s3Prefix = 'docs/docsworker-xlarge/DOP-4442/';
const listCommand = new ListObjectsV2Command({ Bucket: bucket, Prefix: s3Prefix });
const repoDir = this._config.get<string>('repo_dir');
const repoDir = 'test-s3-fetching-repo';
// Since the Makefiles move the path to Snooty a bit, we want to make sure we target the original, before the
// frontend is built
const originalSnootyPath = `${repoDir}/../../snooty`;
await this._logger.save(this._currJob._id, `originalSnootyPath: ${originalSnootyPath}`);
const originalSnootyPath = `${repoDir}/../snooty`;
console.log(`originalSnootyPath: ${originalSnootyPath}`);
const targetPublicDirectory = path.join(originalSnootyPath, '/public');
await this._logger.save(this._currJob._id, `targetPublicDirectory: ${targetPublicDirectory}`);
console.log(`targetPublicDirectory: ${targetPublicDirectory}`);
// Target cache directory should just be the root snooty dir since objects will have ".cache/" already included
const targetCacheDirectory = path.join(originalSnootyPath);
console.log(`targetCacheDirectory: ${targetCacheDirectory}`);

// For debugging purposes
let contents = '';
let n = 0;
const keysList: { objKey: string; destPath: string }[] = [];

// NOTE: This currently does not take into account the .cache folder
try {
let isTruncated = true;

@@ -561,47 +586,132 @@
while (isTruncated) {
const { Contents, IsTruncated, NextContinuationToken } = await client.send(listCommand);
if (!Contents) {
this._logger.info(this._currJob._id, 'No contents');
console.log('No contents');
break;
}

console.log('Contents found');

for (const obj of Contents) {
const objKey = obj.Key;
if (!objKey) {
continue;
}

const getCommand = new GetObjectCommand({ Bucket: bucket, Key: objKey });
const { Body: objBody } = await client.send(getCommand);

// Save S3 objects to local file paths
// Files in the local public directory should exclude path prefixes
const localFileName = objKey.replace(s3Prefix, '');
const targetFilePath = path.join(targetPublicDirectory, localFileName);
this._logger.info(this._currJob._id, `targetFilePath: ${targetFilePath}`);
if (objBody) {
await writeFileAsync(targetFilePath, await objBody.transformToString());
const targetDir = objKey.includes('.cache') ? targetCacheDirectory : targetPublicDirectory;
const targetFilePath = path.join(targetDir, localFileName);

// Some objects are just empty directories, apparently
if (!objKey.endsWith('/')) {
keysList.push({ objKey, destPath: targetFilePath });
}
}

// For debugging
const contentsList = Contents.map((c) => {
n++;
return `${c.Key}\n`;
});
contents += contentsList;

isTruncated = !!IsTruncated;
listCommand.input.ContinuationToken = NextContinuationToken;
}
} catch (e) {
this._logger.error(this._currJob._id, e);
console.error(e);
}

// Limit concurrency to avoid rate limits
const limit = pLimit(5);
const downloadPromises = keysList.map(({ objKey, destPath }) => {
return limit(() => this.downloadByUrl(objKey, destPath));
});

// For debugging purposes
this._logger.info(this._currJob._id, contents);
// console.info(contents);
try {
await Promise.all(downloadPromises);
} catch (err) {
console.error(err);
}

console.timeEnd(timerLabel);
}

// TODO-4442: Need to figure out how to split between cache and S3 files
// private async downloadExistingArtifacts() {
// await this._logger.save(this._currJob._id, 'Attempting to download existing artifacts');
// const client = new S3Client({ region: 'us-east-2' });
// const bucket = process.env.BUCKET;
// if (!bucket) {
// this._logger.error(this._currJob._id, `Missing bucket: ${bucket}`);
// return;
// }

// // S3 object prefix should match the path prefix that Mut uploads to for the build
// // Probably want to make this an argument, but leave as a static variable for testing
// // We'll need to figure out how to handle differences between prod deploy vs. content staging build, if necessary
// const s3Prefix = 'docs/docsworker-xlarge/DOP-4442/';
// const listCommand = new ListObjectsV2Command({ Bucket: bucket, Prefix: s3Prefix });
// const repoDir = this._config.get<string>('repo_dir');
// // Since the Makefiles move the path to Snooty a bit, we want to make sure we target the original, before the
// // frontend is built
// const originalSnootyPath = `${repoDir}/../../snooty`;
// await this._logger.save(this._currJob._id, `originalSnootyPath: ${originalSnootyPath}`);
// const targetPublicDirectory = path.join(originalSnootyPath, '/public');
// await this._logger.save(this._currJob._id, `targetPublicDirectory: ${targetPublicDirectory}`);

// // For debugging purposes
// let contents = '';
// let n = 0;

// // NOTE: This currently does not take into account the .cache folder
// try {
// let isTruncated = true;

// this._fileSystemServices.createDirIfNotExists(targetPublicDirectory);

// // Grab contents, and then attempt to continue, in case there are more objects
// while (isTruncated) {
// const { Contents, IsTruncated, NextContinuationToken } = await client.send(listCommand);
// if (!Contents) {
// this._logger.info(this._currJob._id, 'No contents');
// break;
// }

// for (const obj of Contents) {
// const objKey = obj.Key;
// if (!objKey) {
// continue;
// }

// const getCommand = new GetObjectCommand({ Bucket: bucket, Key: objKey });
// const { Body: objBody } = await client.send(getCommand);

// // Save S3 objects to local file paths
// // Files in the local public directory should exclude path prefixes
// const localFileName = objKey.replace(s3Prefix, '');
// const targetFilePath = path.join(targetPublicDirectory, localFileName);
// this._logger.info(this._currJob._id, `targetFilePath: ${targetFilePath}`);
// if (objBody) {
// await writeFileAsync(targetFilePath, await objBody.transformToString());
// }
// }

// // For debugging
// const contentsList = Contents.map((c) => {
// n++;
// return `${c.Key}\n`;
// });
// contents += contentsList;

// isTruncated = !!IsTruncated;
// listCommand.input.ContinuationToken = NextContinuationToken;
// }
// } catch (e) {
// this._logger.error(this._currJob._id, e);
// }

// // For debugging purposes
// this._logger.info(this._currJob._id, contents);
// }

@throwIfJobInterupted()
protected async buildWithMakefiles(): Promise<boolean> {
this.cleanup();
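A note on the new downloadByUrl: res.data.pipe(dest) returns before the file is fully written, so success should only be declared after the write stream finishes. Node's built-in stream/promises pipeline expresses that wait in one call; a minimal alternative sketch under the same axios streaming setup (streamToFile is a hypothetical helper, not part of this commit):

import fs from 'fs';
import axios from 'axios';
import { pipeline } from 'stream/promises';

// Resolves once the response body is fully flushed to destPath;
// rejects if either the download or the disk write errors out.
async function streamToFile(url: string, destPath: string): Promise<void> {
  const res = await axios.get(url, { timeout: 10000, responseType: 'stream' });
  await pipeline(res.data, fs.createWriteStream(destPath));
}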

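The listing loop in downloadExistingArtifacts follows the standard ListObjectsV2 pagination pattern: re-send the command with the returned NextContinuationToken until IsTruncated comes back false. The same pattern condensed into a standalone helper, with bucket and prefix left as caller-supplied placeholders:

import { ListObjectsV2Command, S3Client } from '@aws-sdk/client-s3';

// Walks every page of results and returns all object keys under the prefix.
async function listAllKeys(client: S3Client, bucket: string, prefix: string): Promise<string[]> {
  const command = new ListObjectsV2Command({ Bucket: bucket, Prefix: prefix });
  const keys: string[] = [];
  let isTruncated = true;

  while (isTruncated) {
    const { Contents, IsTruncated, NextContinuationToken } = await client.send(command);
    // Skip "directory" placeholder objects, whose keys end with a slash.
    for (const obj of Contents ?? []) {
      if (obj.Key && !obj.Key.endsWith('/')) keys.push(obj.Key);
    }
    isTruncated = !!IsTruncated;
    command.input.ContinuationToken = NextContinuationToken;
  }

  return keys;
}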