123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- const cheerio = require('cheerio');
- const rp = require('request-promise');
- const fs = require('fs');
/** Origin of the crawled site; relative links scraped from pages are appended to it. */
const BASE_URL = "https://www.skroutz.gr";

/** Path (relative to BASE_URL) of the shop page whose categories are crawled. */
const SHOP_URL = "/m.DasHome.10646.html";

/**
 * Pending HTTP requests, processed first-in first-out.
 * Each entry is a `[resolve, reject, options]` triple pushed by `httpsGet`.
 */
const queue = [];

/**
 * Timer handle of the scheduled queue tick, or `null` while the queue is idle.
 * Reassigned by `httpsGet` and `exec`, hence `let` rather than `const`.
 */
let _timer = null;
/**
 * Processes one queued request and schedules the next tick one second later.
 *
 * Takes the oldest `[resolve, reject, options]` entry off `queue`, issues the
 * request through `rp`, and settles the caller's promise with the outcome.
 * When the queue is empty the tick chain stops and `_timer` is reset to
 * `null` so that the next `httpsGet` call can restart it.
 */
function exec() {
  if (queue.length === 0) {
    _timer = null;
    return;
  }

  const [resolve, reject, options] = queue.shift();
  console.log("Performing Query", options.uri);

  try {
    rp(options).then(resolve, reject);
  } catch (err) {
    // A synchronous failure must reject this one request only; without the
    // guard the next tick below would never be scheduled and the whole
    // queue would stall with `_timer` still set.
    reject(err);
  }

  // Keep the handle current so `_timer` always refers to the pending tick.
  _timer = setTimeout(exec, 1000);
}
/**
 * Enqueues a GET request for `uri` on the shared, throttled request queue.
 *
 * The request itself is issued later by `exec`; this function only records
 * the request options together with the promise's settle callbacks and, if
 * no tick is pending, starts the queue after 100 ms.
 *
 * @param {string} uri - Address to fetch.
 * @returns {Promise<*>} Settles with whatever the queued request yields; the
 *   `transform` option asks the HTTP client to hand back `cheerio.load(body)`.
 */
function httpsGet(uri) {
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36';

  const requestOptions = {
    uri: uri,
    headers: { 'user-agent': userAgent },
    transform: function (body) {
      return cheerio.load(body);
    }
  };

  return new Promise(function (resolve, reject) {
    queue.push([resolve, reject, requestOptions]);

    // A tick is already pending: it will pick this entry up in turn.
    if (_timer) {
      return;
    }
    _timer = setTimeout(exec, 100);
  });
}
/**
 * Converts a scraped price label into a number.
 *
 * When the label contains a decimal comma (e.g. "1.299,00 €"), every dot is
 * treated as a thousands separator and removed before the comma is turned
 * into a decimal point. Labels without a comma are parsed as they are.
 * NOTE(review): assumes the site formats prices Greek-style — confirm.
 *
 * @param {string} text - Raw label text, optionally containing "€".
 * @returns {number} The parsed amount, or NaN when the text is not numeric.
 */
function parseEuroPrice(text) {
  const cleaned = String(text).replace('€', '').trim();
  const normalized = cleaned.includes(',')
    ? cleaned.replace(/\./g, '').replace(',', '.')
    : cleaned;
  return parseFloat(normalized);
}

/**
 * Scrapes a shop's categories, the products in each category and the
 * shops/prices listed on each product page. All pages are fetched through
 * `httpsGet`.
 */
class Crawler {
  constructor() {
    this.baseURL = BASE_URL;
  }

  /**
   * Reads the category cards from the shop page.
   *
   * @returns {Promise<Array<{name: string, quantity: number, link: string}>>}
   *   One entry per category anchor. On failure the error is logged and an
   *   empty list is returned, so callers can always iterate the result.
   */
  getCategories() {
    const anchorSelector = ".super-categories .card:not(.manufacturer-card) h2 a";
    return httpsGet(this.baseURL + SHOP_URL)
      .then(($) => {
        const anchors = $(anchorSelector).toArray();
        // The item count is rendered as "(N)" in a span inside each anchor.
        const counts = $(anchorSelector + " span").toArray()
          .map((span) => $(span).text().replace('(', '').replace(')', ''));

        return anchors.map((anchor, index) => ({
          name: $(anchor).attr('title'),
          quantity: parseInt(counts[index], 10),
          link: this.baseURL + $(anchor).attr('href')
        }));
      })
      .catch((e) => {
        console.log("ERROR:::::", e);
        return [];
      });
  }

  /**
   * Reads the products listed on one listing page.
   *
   * List items and product anchors are matched by position.
   * NOTE(review): an item without a price anchor would shift later entries —
   * confirm every `li` carries one.
   *
   * @param {string} url - Listing page address.
   * @returns {Promise<Array<{link: string, sku: *, price: string, title: string}>>}
   *   On failure the error is logged and an empty list is returned.
   */
  getProducts(url) {
    return httpsGet(url)
      .then(($) => {
        const skuIds = $("ol#sku-list li").toArray().map((item) => $(item).data('skuid'));
        const anchors = $("ol#sku-list li div.price a.js-sku-link").toArray();
        const links = anchors.map((item) => $(item).attr('href'));
        const prices = anchors.map((item) => $(item).text());
        const titles = anchors.map((item) => $(item).attr('title'));

        return skuIds.map((sku, index) => ({
          link: this.baseURL + links[index],
          sku,
          price: prices[index],
          title: titles[index]
        }));
      })
      .catch((er) => {
        console.log("ERROR PRODUCT GET : ", er);
        return [];
      });
  }

  /**
   * Fetches every listing page of a category and merges the products.
   *
   * @param {string} url - Category listing address.
   * @param {number} total - Number of products in the category.
   * @param {number} [pageSize=48] - Products per listing page.
   * @returns {Promise<Array<Object>>} Products of all pages, in page order.
   */
  getAllProducts(url, total, pageSize = 48) {
    console.log(url);
    const pages = Math.ceil(total / pageSize);
    // Extend an existing query string rather than starting a second one.
    const separator = String(url).includes('?') ? '&' : '?';

    const requests = [];
    for (let page = 1; page <= pages; page++) {
      requests.push(this.getProducts(url + separator + "page=" + page));
    }

    return Promise.all(requests)
      // Pages that yielded no list are skipped so no `undefined` leaks in.
      .then((values) => [].concat(...values.filter(Array.isArray)))
      .catch((e) => {
        console.log("ALL PRODUCTS ERROR:::::", url, e);
        return [];
      });
  }

  /**
   * Reads the shops offering a product and their prices.
   *
   * @param {string} url - Product page address.
   * @returns {Promise<Array<{name: string, price: number}>>}
   *   On failure the error is logged and an empty list is returned, matching
   *   the other scraping methods, so one bad page cannot abort a whole crawl.
   */
  getCompanies(url) {
    console.log(url);
    return httpsGet(url)
      .then(($) => {
        const names = $("ol#prices li .shop .shop-name").toArray()
          .map((i) => $(i).text());
        const prices = $("ol#prices li .price a.product-link").toArray()
          .map((i) => parseEuroPrice($(i).text()));

        return names.map((name, index) => ({ name, price: prices[index] }));
      })
      .catch((e) => {
        console.log("ERROR COMPANIES GET : ", e);
        return [];
      });
  }

  /**
   * Runs the full crawl: categories, then products, then shops per product.
   *
   * @returns {Promise<{categories: Array, skus: Array<Array>, companies: Array<Array>}>}
   *   `skus` holds one product list per category; `companies` holds one shop
   *   list per product, ordered like the flattened `skus`.
   */
  retrieve() {
    const result = {
      categories: [],
      skus: [],
      companies: []
    };

    return this.getCategories()
      .then((categories) => {
        result.categories = categories || [];
        return Promise.all(
          result.categories.map((cat) => this.getAllProducts(cat.link, cat.quantity))
        );
      })
      .then((skus) => {
        result.skus = skus;
        // `skus` is grouped per category; flatten it so each product (not
        // each category list) gets its own shop lookup.
        const products = [].concat(...skus.filter(Array.isArray));
        return Promise.all(products.map((sku) => this.getCompanies(sku.link)));
      })
      .then((companies) => {
        result.companies = companies;
        return result;
      });
  }
}
// Entry point: run the full crawl and dump the result as JSON into ./data.
const crawler = new Crawler();
crawler.retrieve()
  .then((data) => {
    fs.writeFileSync("data", JSON.stringify(data));
    console.log("SUCCESS123123");
    console.log(data);
    console.log("SUCCESS123123");
  })
  .catch((err) => {
    // Report crawl or write failures instead of leaving an unhandled
    // rejection, and signal the failure through the exit status.
    console.error("CRAWL FAILED:", err);
    process.exitCode = 1;
  });
- //crawler.getCategories();
- // crawler.getAllProducts('/c/1158/kourtines/m/10646/Das-Home.html', 49);
- // crawler.getCompanies('/s/12890875/Das-Home-%CE%A3%CE%B5%CF%84-Ninjago-Lego-5008-160x260.html');
|