123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185 |
- const cheerio = require('cheerio');
- const request = require('request');
- const rp = require('request-promise');
- const fs = require('fs');
- const Actions = require('klapi/systems/ActionSystem');
- // const rp = rpi.defaults({'proxy': "http://anonymous:[email protected]:54325"})
// Origin of the crawled site; Crawler stores it as `baseURL` and prepends it to the relative hrefs scraped from pages.
- const BASE_URL = "https://www.skroutz.gr";
// Path of the shop landing page, appended to the base URL by Crawler.getCategories to list the shop's categories.
- const SHOP_URL = "/m.DasHome.10646.html";
/** Delay before the first queued request is sent once the queue wakes up. */
const FIRST_REQUEST_DELAY_MS = 100;
/** Minimum pause between two consecutive requests. */
const MIN_REQUEST_GAP_MS = 4000;
/** Upper bound of the random extra pause added on top of the minimum gap. */
const REQUEST_GAP_JITTER_MS = 2000;

/** Requests waiting to be sent, oldest first. Each entry is `{ resolve, reject, options }`. */
const queue = [];
/** Handle of the timer that will run the next `exec`, or `null` while the queue is idle. */
let _timer = null;

/**
 * Sends the oldest queued request and schedules the next run.
 *
 * Runs only from timers. When the queue is empty it clears `_timer` and stops,
 * so the next `httpsGet` call starts the cycle again.
 */
function exec() {
  if (!queue.length) {
    _timer = null;
    return;
  }
  const job = queue.shift();
  // Re-arm before touching the network: if anything below throws, the rest of
  // the queue must still be drained and `_timer` must point at a live timer.
  _timer = setTimeout(exec, MIN_REQUEST_GAP_MS + REQUEST_GAP_JITTER_MS * Math.random());
  console.log("Performing Query", job.options.uri);
  try {
    rp(job.options).then(job.resolve, job.reject);
  } catch (err) {
    // A synchronous failure must still settle the caller's promise.
    job.reject(err);
  }
}

/**
 * Queues a throttled GET request for `uri`.
 *
 * Requests are sent one at a time, 4-6 seconds apart, in the order they were queued.
 *
 * @param {string} uri - Absolute URL to fetch.
 * @returns {Promise<*>} Resolves with the response body passed through
 *   `cheerio.load`; rejects with whatever error the request produced.
 */
function httpsGet(uri) {
  const options = {
    uri,
    tunnel: false,
    headers: {
      'user-agent': 'Chrome/74.0.3729.169'
    },
    transform: (body) => cheerio.load(body)
  };
  return new Promise((resolve, reject) => {
    queue.push({ resolve, reject, options });
    if (!_timer) {
      _timer = setTimeout(exec, FIRST_REQUEST_DELAY_MS);
    }
  });
}
/** Listing page size assumed when turning a category's product count into a page count. */
const PRODUCTS_PER_PAGE = 48;
/** Category links on the shop landing page; manufacturer cards are excluded. */
const CATEGORY_LINK_SELECTOR = ".super-categories .card:not(.manufacturer-card) h2 a";
/** Product links inside the listing; each one carries the href, title and price text. */
const SKU_LINK_SELECTOR = "ol#sku-list li div.price a.js-sku-link";

/**
 * Parses a price such as "12,50 €" or "1.234,50 €" into a number.
 *
 * When the text contains a comma it is treated as the decimal separator and
 * dots as thousands separators; otherwise the digits are parsed as they are.
 *
 * @param {*} text - Raw text scraped from the page.
 * @returns {number} The parsed price, or NaN when the text holds no digits.
 */
function parsePrice(text) {
  const match = /\d[\d.,]*/.exec(String(text));
  if (!match) return NaN;
  let digits = match[0];
  if (digits.includes(',')) {
    digits = digits.replace(/\./g, '').replace(',', '.');
  }
  return parseFloat(digits);
}

/**
 * Concatenates a list of arrays into one array, ignoring non-array entries.
 *
 * @param {Array} lists - Values collected from `Promise.all`.
 * @returns {Array} All elements of the array entries, in order.
 */
function flatten(lists) {
  return lists.reduce((all, list) => (Array.isArray(list) ? all.concat(list) : all), []);
}

/**
 * Scrapes the shop's categories, the products in each category and the
 * shops offering each product. Every scraped batch is also published through
 * `Actions.emit`.
 */
class Crawler {
  constructor() {
    this.baseURL = BASE_URL;
  }

  /**
   * Reads the shop landing page and returns its categories.
   *
   * Emits `crawlCategory` with the result on success.
   *
   * @param {number} [limit=9999] - Maximum number of categories to return.
   * @returns {Promise<Array<{name: string, quantity: number, link: (string|undefined)}>>}
   *   The categories; an empty array if the page could not be fetched or parsed
   *   (the error is logged). `link` is undefined when the anchor has no href.
   */
  getCategories(limit = 9999) {
    return httpsGet(this.baseURL + SHOP_URL)
      .then(($) => {
        const categories = $(CATEGORY_LINK_SELECTOR)
          .toArray()
          .slice(0, limit)
          .map((anchor) => {
            const $anchor = $(anchor);
            const href = $anchor.attr('href');
            // The count is rendered as "(123)" in a span inside the link; reading it
            // from this anchor keeps it paired with the right category.
            const countText = $anchor.find('span').first().text();
            return {
              name: $anchor.attr('title'),
              quantity: parseInt(countText.replace(/\D/g, ''), 10),
              link: href ? this.baseURL + href : undefined
            };
          });
        Actions.emit('crawlCategory', categories);
        return categories;
      })
      .catch((e) => {
        console.log("ERROR:::::", e);
        return [];
      });
  }

  /**
   * Reads one listing page and returns the products on it.
   *
   * Emits `crawlSKU` with the result on success.
   *
   * @param {string} url - Listing page URL.
   * @param {*} category - Value copied onto every product as `category`.
   * @returns {Promise<Array<{link: string, sku: *, price: number, title: string, category: *}>>}
   *   The products; an empty array if the page could not be fetched or parsed
   *   (the error is logged).
   */
  getProducts(url, category) {
    return httpsGet(url)
      .then(($) => {
        const products = [];
        $(SKU_LINK_SELECTOR).toArray().forEach((anchor) => {
          const $anchor = $(anchor);
          const href = $anchor.attr('href');
          if (!href || href === "undefined") return;
          products.push({
            link: this.baseURL + href,
            // Take the id from the list item that contains this link, so items
            // without a link cannot shift ids onto the wrong product.
            sku: $anchor.closest('li[data-skuid]').data('skuid'),
            price: parsePrice($anchor.text()),
            title: $anchor.attr('title'),
            category
          });
        });
        Actions.emit('crawlSKU', products);
        return products;
      })
      .catch((er) => {
        console.log("ERROR PRODUCT GET : ", er);
        return [];
      });
  }

  /**
   * Fetches every listing page of a category and merges the products.
   *
   * @param {string} url - Category URL; `page=N` is appended as a query parameter.
   * @param {number} total - Number of products in the category, used to derive the page count.
   * @param {*} name - Category name passed through to `getProducts`.
   * @returns {Promise<Array>} Products from all pages; an empty array when `url`
   *   is missing or when collecting the pages fails (the error is logged).
   */
  getAllProducts(url, total, name) {
    if (url == null) return Promise.resolve([]);
    const pageCount = Math.ceil(total / PRODUCTS_PER_PAGE);
    const separator = String(url).includes('?') ? '&' : '?';
    const pending = [];
    for (let page = 1; page <= pageCount; page++) {
      pending.push(this.getProducts(url + separator + "page=" + page, name));
    }
    return Promise.all(pending)
      .then(flatten)
      .catch((e) => {
        console.log("ALL PRODUCTS ERROR:::::", url, e);
        return [];
      });
  }

  /**
   * Reads a product page and returns the shops offering the product.
   *
   * Emits `crawlProduct` with the result. Unlike the other getters this one
   * does not catch request errors: the returned promise rejects with them.
   *
   * @param {string} url - Product page URL.
   * @param {*} sku - Product id copied onto every offer.
   * @returns {Promise<Array<{name: string, price: number, sku: *, SKUName: string}>>}
   *   One entry per shop name found; an empty array when `url` is missing.
   */
  getCompanies(url, sku) {
    if (url == null) return Promise.resolve([]);
    return httpsGet(url).then(($) => {
      const names = $("ol#prices li .shop .shop-name").toArray().map((el) => $(el).text());
      // NOTE(review): names and prices are paired by position, so this assumes
      // every offer row has both a shop name and a price link.
      const prices = $("ol#prices li .price a.product-link").toArray().map((el) => parsePrice($(el).text()));
      const offers = names.map((name, index) => ({
        name,
        price: index < prices.length ? prices[index] : NaN,
        sku,
        SKUName: sku + name
      }));
      Actions.emit('crawlProduct', offers);
      return offers;
    });
  }

  /**
   * Crawls categories, then their products, then the shops for each product.
   *
   * @param {number} [limit=9999] - Maximum number of categories to crawl.
   * @returns {Promise<{categories: Array, skus: Array, companies: Array}>}
   *   Everything that was collected. Rejects if a product page request fails.
   */
  retrieve(limit = 9999) {
    const result = {
      categories: [],
      skus: [],
      companies: []
    };
    return this.getCategories(limit)
      .then((categories) => {
        result.categories = categories;
        return Promise.all(
          categories.map((cat) => this.getAllProducts(cat.link, cat.quantity, cat.name))
        );
      })
      .then((productsPerCategory) => {
        result.skus = flatten(productsPerCategory);
        return Promise.all(result.skus.map((sku) => this.getCompanies(sku.link, sku.sku)));
      })
      .then((offersPerSku) => {
        result.companies = flatten(offersPerSku);
        return result;
      });
  }
}
- // let crawler = new Crawler();
- // crawler.retrieve().then((data) => {
- // fs.writeFileSync("data", JSON.stringify(data));
- // console.log("SUCCESS123123");
- // console.log(data);
- // console.log("SUCCESS123123");
- // });
- //crawler.getCategories();
- // crawler.getAllProducts(BASE_URL + '/c/2729/anostromata/m/10646/Das-Home.html', 49);
- // crawler.getCompanies('/s/12890875/Das-Home-%CE%A3%CE%B5%CF%84-Ninjago-Lego-5008-160x260.html');
// Export one Crawler instance, constructed when this module is evaluated, rather than the class itself.
- module.exports = new Crawler();
|