-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyulWA-social-media
executable file
·33 lines (27 loc) · 1.22 KB
/
yulWA-social-media
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
#
# Settings for the "yu-social-media" Browsertrix crawl. Everything below is a
# plain global that the functions later in this script read.

# Collection name handed to the crawler (--collection); it also forms part of
# the stats filename and of the archive directory the WARCs are staged from.
COLLECTION_NAME='yu-social-media'

# Tag of the webrecorder/browsertrix-crawler image to run, and the value given
# to the crawler's --workers option.
VERSION='1.4.2'
WORKERS=10

# Host paths: the directory mounted into the container as /crawls, the crawl
# config mounted as /app/crawl-config.yaml, and the log file for crawl output.
CRAWL_DIR='/mnt/omega/web-archives/browsertrix'
CRAWL_CONFIG='/opt/web-archiving-cron-scripts/crawl-configs/yu-social-media.yaml'
LOG_FILE='/var/log/heritrix/yu-social-media-browsertrix.log'

# Start-of-run date stamp (YYYYMMDD) embedded in the stats filename.
DATE="$(date '+%Y%m%d')"
#######################################
# Run the Browsertrix crawler in a Docker container and record the outcome.
# Globals (read):
#   CRAWL_DIR, CRAWL_CONFIG, COLLECTION_NAME, WORKERS, VERSION, DATE, LOG_FILE
# Outputs:
#   The container's stdout and stderr overwrite LOG_FILE; one status line is
#   appended to LOG_FILE afterwards.
# Returns:
#   0 when `docker run` exits 0, otherwise the exit status of `docker run`.
#   Exits the whole script if CRAWL_DIR cannot be entered.
#######################################
function start_crawl {
  cd "$CRAWL_DIR" || exit

  # The command is tested directly instead of inspecting $? afterwards, so no
  # statement can slip in between and clobber the status.
  #
  # NOTE(review): the last quoted argument has no option flag in front of it,
  # so it is passed as a bare positional - possibly it was meant to be the
  # value of a user-agent option; confirm against the crawler's CLI. The
  # address inside it also looks obfuscated in this copy; it is kept verbatim.
  if /usr/bin/docker run \
    -v "$CRAWL_CONFIG:/app/crawl-config.yaml" \
    -v "$CRAWL_DIR:/crawls" \
    "webrecorder/browsertrix-crawler:$VERSION" \
    crawl \
    --config /app/crawl-config.yaml \
    --collection "$COLLECTION_NAME" \
    --allowHashUrls \
    --workers "$WORKERS" \
    --saveState always \
    --statsFilename "$COLLECTION_NAME-stats-$DATE.json" \
    "+YorkUniversityLibrariesCrawlerBot, [email protected]" \
    > "$LOG_FILE" 2>&1; then
    # NOTE(review): `docker run` is not detached here, so this line is written
    # after the container has exited, despite saying "started".
    echo "[INFO] $(date) - started" >> "$LOG_FILE"
  else
    # Capture the status before echo resets $?, then hand it to the caller so
    # a failed crawl is no longer reported as success.
    local status=$?
    echo "[ERROR] $(date)" >> "$LOG_FILE"
    return "$status"
  fi
}
#######################################
# Prefix the collection's WARC files and move them to the import directory.
# Globals (read):
#   CRAWL_DIR, COLLECTION_NAME, LOG_FILE
# Outputs:
#   Appends one [INFO] or [ERROR] line per step to LOG_FILE; the verbose
#   output of rename/mv goes to stdout.
# Returns:
#   0 when both the rename and the move succeed; 1 when there are no WARC
#   files or either step fails. Exits the whole script if the archive
#   directory cannot be entered.
#######################################
function stage_warcs {
  cd "$CRAWL_DIR/collections/$COLLECTION_NAME/archive" || exit

  # With no match the glob stays as the literal pattern, which would otherwise
  # be handed to rename and mv as if it were a filename.
  local warcs=(./*.warc.gz)
  if [[ ! -e "${warcs[0]}" ]]; then
    echo "[ERROR] $(date) - no WARC files to stage" >> "$LOG_FILE"
    return 1
  fi

  local status=0

  # The expression is a Perl-style substitution, so this presumably relies on
  # the Perl `rename` utility - confirm on the host.
  if rename -v 's/rec-/YU-SOCIAL-MEDIA-rec-/g' "${warcs[@]}"; then
    echo "[INFO] $(date) - renamed files" >> "$LOG_FILE"
  else
    echo "[ERROR] $(date) - rename failed" >> "$LOG_FILE"
    status=1
  fi

  # Still attempted after a rename failure, as before; the glob is expanded
  # again here because the filenames may have just changed.
  if mv -v ./*.warc.gz /mnt/omega/web-archives/import; then
    echo "[INFO] $(date) - staged files" >> "$LOG_FILE"
  else
    echo "[ERROR] $(date) - staging failed" >> "$LOG_FILE"
    status=1
  fi

  return "$status"
}
# Script entry point: crawl, stage the resulting WARCs, then tidy up Docker.
main() {
  start_crawl
  stage_warcs

  # -f skips the confirmation prompt. This prunes stopped containers on the
  # Docker host in general; it is not limited to the one this script started.
  docker container prune -f
}

main "$@"