-
index.js// For more information, see https://crawlee.dev/
import { CheerioCrawler, ProxyConfiguration, Configuration,Dataset } from 'crawlee';
import { router } from './routes.js';
//const products = require('./routes.ts');
import {products} from './routes.js'
const startUrls = ['https://www.hugoboss.com/de/herren-kleidung/?start=0&sz=10'];
process.env.CRAWLEE_STORAGE_DIR = '/tmp/crawlee/storage';
export const handler = async (event) => {
console.log('start')
const hugo_bossScraper = new CheerioCrawler({
// proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
requestHandler: router,
// Comment this option to scrape the full website.
}, new Configuration({
persistStorage: false,
}));
await hugo_bossScraper.run(startUrls);
await Dataset.pushData(products)
console.log(products)
return {
statusCode: 200,
body: await hugo_bossScraper.getData(),
}
}
handler()
import { createCheerioRouter } from 'crawlee';
export const router = createCheerioRouter();
export const products = []
<details>
router.addDefaultHandler(async ({ request, $, enqueueLinks, crawler, log, }) => {
log.debug(`Processing ${request.url}...`);
const gender = request.url.match(/\/([^/]+)-([^/]+)\/[^/]+$/)[1];
const generalCategory = request.url.match(/\/([^/]+)-([^/]+)\/[^/]+$/)[2];
const productList = [];
console.log(`PRODUCTSAmount: ${$("[data-productposition]").length}`);
if($("[data-productposition]").length > 0){
$("[data-productposition]").each((index, element) => {
const productElement = $(element); // Cache the cheerio element for efficiency
const name = productElement.find("article a:nth-child(1)").attr("title"); // Find the first anchor tag and get its text
const href = productElement.find("article a:first").attr("href"); // Find the first anchor tag and get its href attribute
const price = productElement.find("article .pricing") // Find the first anchor tag and get its href attribute
const category = JSON.parse(productElement.find("script:nth-child(4)").text()).category
if (href) {
}
/* await hugo_bossScraper.addRequests([{
url: href,
label: 'PRODUCT',
}]);*/
productList.push(
{ url: `https://www.hugoboss.com${href}`, label: 'PRODUCT', userData: {gender: gender, category: category,}}
)
})
console.log(productList.length)
await crawler.addRequests(productList)
// const startUrl = request.url.match(/[?&]start=(\d+)/) ? parseInt(request.url.match(/[?&]sz=(\d+)/)[1], 10)+76 : null
// enqueueLinks({urls: [`https://www.hugoboss.com/de/herren-kleidung/?start=${startUrl}&sz=76`]})
}
});
</details>
router.addHandler('PRODUCT', async ({ request, $, log, crawler, }) => {
console.log(`PRODUCT: ${request.url}`);
const productId = $('.pdp-stage__accordion-style-number').text().trim().replace(/\s+/g, "")+"hugoboss";
if(!products.find(item => item.productId === productId)){
<details>
const productLine = $('.bottomsheet__logo--boss').text()
const srcImage = $('.pdp-stage__images:first-child picture:first-child img').attr('src');
const altImage = $('.pdp-stage__images:first-child picture:first-child img').attr('alt');
const widthImage = $('.pdp-stage__images:first-child picture:first-child img').attr('width');
const heightImage = $('.pdp-stage__images:first-child picture:first-child img').attr('height');
const name = $('h1.pdp-stage__header-title').text().trim()
products.push({
productId: productId,
name: name,
srcImage: srcImage,
altImage: altImage,
widthImage: widthImage,
heightImage: heightImage,
category: request.userData.category,
variants: []
})
</details>
const urls = []
$(".pdp-stage_color-selector nav a").each( (index, element) => {
const href = $(element).attr("href");
if (href) {
console.log("href"+href)
urls.push({ url:`https://www.hugoboss.com${href}` , label: 'VARIANT',userData: { data:{variantNumber: index, productId: productId, gender: request.userData.gender}} })
}
})
console.log("urls-LENGTH"+urls.length)
console.log(urls);
await crawler.addRequests(urls)
}else {
console.log("ALREADY EXISTS")
}
});
router.addHandler('VARIANT', async ({ request, $, log, }) => {
console.log(`VARIANT: ${request.url}`);
<details>
const variantId = $('.pdp-stage__accordion-style-number').text().trim().replace(/\s+/g, "")+"variant"+request.userData.variantNumber
const name = $('h1.pdp-stage__header-title').text().trim()
const link = request.url
const variantDescription = $('.pdp-stage__accordion-description').text().trim()
const position = request.userData.data.variantNumber
const variantOptions = [{content: "no_options_available"}]
const gender = request.userData.data.gender
const images = []
const available= []
const pricesShop = [{price: $('.pricing__main-price').text().match(/\d+(?:[,.]\d+)?(?:\s€)?/g), date: Date.now()}]
$(".pdp-stage__images picture").each((index, element) => {
const src = $(element).find("img").attr("src"); // Find the first anchor tag and get its text
const alt = $(element).find("img").attr("alt"); // Find the first anchor tag and get its href attribute
const width = $(element).find("img").attr("width");
const height = $(element).find("img").attr("height");
const position = index
const image = {src: src, alt: alt, width: width, height: height, position: position}
images.push(image)
})
$(".size-select__list .size-select__list-element").each((index, element) => {
const size = $(element).find("span").text().trim()
const available1 = $(element).find(".size-select__pan").length >= 1 ? false : true // Find the first anchor tag and get its href attribute
const availableObject = {size: size, available: available1}
available.push(availableObject)
})
/*
console.log( "product.request:"+request.userData.data.productId);
console.log( products);
console.log( products[0].productId === request.userData.data.productId);
console.log("indexof1")
console.log(products[products.indexOf(products.find(item => item.productId === request.userData.productId))])
console.log(products[0])
console.log("indexof2")
console.log(products[products.indexOf(products.find(item => item.productId === request.userData.data.productId))])*/
products[products.indexOf(products.find(item => item.productId === request.userData.data.productId))].variants.push({
name: name,
variantId: variantId,
link: link,
variantDescription: variantDescription,
available: available,
position: position,
variantOptions: variantOptions,
images: images,
gender: gender,
pricesShop: pricesShop
})</details>
}); npm start console output
Problem: In this example the product: "'Fein gemustertes Regular-Fit Sakko" has one Variant which gets enqued with the Link: "https://www.hugoboss.com/de/fein-gemustertes-regular-fit-sakko/hbeu50498700_413.html" (U can see it in the console.log() when u search the link) but this link will never be scraped. |
Beta Was this translation helpful? Give feedback.
Answered by
Lukas0203
Apr 30, 2024
Replies: 1 comment 6 replies
-
Could you show us the value of |
Beta Was this translation helpful? Give feedback.
6 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey, I think I found the problem. The problem was that Crawlee was not crawling a link twice. I rewrote the code to solve this problem and now it seems to work. But anyways thanks a lot for your help and the tips for formating the github post. :)