Crawler.js 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. const cheerio = require('cheerio');
  2. const request = require('request');
  3. const rp = require('request-promise');
  4. const fs = require('fs');
  5. const Actions = require('klapi/systems/ActionSystem');
  6. // const rp = rpi.defaults({'proxy': "http://anonymous:[email protected]:54325"})
  // Origin that scraped relative links are prefixed with.
  const BASE_URL = 'https://www.skroutz.gr';

  // Path of the shop landing page, appended to BASE_URL.
  const SHOP_URL = '/m.DasHome.10646.html';

  // Pending requests, stored as [resolve, reject, options] and served first-in first-out.
  const queue = [];

  // Handle of the timer that started the current drain; reset to null once the
  // queue is found empty.
  let _timer = null;
  /**
   * Serves the oldest queued request and re-arms itself for the next one.
   *
   * When the queue is empty the drain stops and `_timer` is cleared so that the
   * next `httpsGet` call can start a new drain. Otherwise one entry is dequeued,
   * its request is issued through `rp`, and the entry's resolve/reject callbacks
   * are wired to the outcome. The next run is scheduled 4-6 seconds later
   * (4000 ms plus up to 2000 ms of random jitter), regardless of whether the
   * request has finished.
   */
  function exec() {
    if (queue.length === 0) {
      _timer = null;
      return;
    }

    const [resolve, reject, options] = queue.shift();
    console.log("Performing Query", options.uri);

    rp(options)
      .then((data) => resolve(data))
      .catch((err) => reject(err));

    const delay = 4000 + (2000 * Math.random());
    setTimeout(exec, delay);
  }
  /**
   * Queues a GET request for `uri` and returns a promise for its result.
   *
   * The request is not sent here: it is appended to `queue` and later issued by
   * `exec`. The response body is passed through `cheerio.load`, so the promise
   * resolves with the loaded cheerio document. If no drain is running
   * (`_timer` is falsy) one is started after 100 ms.
   *
   * @param {string} uri - Address to fetch.
   * @returns {Promise<*>} Settles with whatever `exec` passes to the stored
   *   resolve/reject callbacks.
   */
  function httpsGet(uri) {
    const requestOptions = {
      uri,
      tunnel: false,
      headers: {
        'user-agent': 'Chrome/74.0.3729.169',
      },
      transform: (body) => cheerio.load(body),
    };

    return new Promise((resolve, reject) => {
      queue.push([resolve, reject, requestOptions]);
      if (_timer) {
        return;
      }
      _timer = setTimeout(exec, 100);
    });
  }
  /** Page size used to derive the number of listing pages from a category's product total. */
  const PRODUCTS_PER_PAGE = 48;

  /**
   * Converts a price label to a number: drops the euro sign and treats the
   * comma as the decimal separator (e.g. "12,90 €" -> 12.9).
   *
   * @param {string} text - Raw label text.
   * @returns {number} Parsed price, or NaN when the text holds no number.
   */
  function parsePrice(text) {
    return parseFloat(String(text).replace('€', '').replace(',', '.'));
  }

  /**
   * Merges an array of arrays into one flat array, ignoring entries that are
   * not arrays (for example the result of a failed lookup).
   *
   * @param {Array} lists - Values produced by Promise.all.
   * @returns {Array} One-level flattened list.
   */
  function flattenLists(lists) {
    return [].concat(...lists.filter(Array.isArray));
  }

  /**
   * Scrapes categories, products (SKUs) and per-shop offers through the
   * rate-limited `httpsGet` queue, announcing each batch via `Actions.emit`.
   */
  class Crawler {
    constructor() {
      this.baseURL = BASE_URL;
    }

    /**
     * Reads the category cards from the shop landing page.
     *
     * Emits 'crawlCategory' with the result. A failure is logged and yields an
     * empty list so callers can always iterate the result.
     *
     * @param {number} [limit=9999] - Maximum number of categories to return.
     * @returns {Promise<Array<{name: string, quantity: number, link: string}>>}
     */
    getCategories(limit = 9999) {
      return httpsGet(this.baseURL + SHOP_URL)
        .then(($) => {
          const anchors = $(".super-categories .card:not(.manufacturer-card) h2 a").toArray();
          const categories = anchors.slice(0, limit).map((item) => {
            const anchor = $(item);
            // The count is rendered as "(N)" in a span inside the anchor; reading
            // it from the same anchor keeps name, link and count paired.
            const count = anchor.find('span').first().text().replace('(', '').replace(')', '');
            return {
              name: anchor.attr('title'),
              quantity: parseInt(count, 10),
              link: this.baseURL + anchor.attr('href'),
            };
          });
          Actions.emit('crawlCategory', categories);
          return categories;
        })
        .catch((e) => {
          console.log("ERROR:::::", e);
          return [];
        });
    }

    /**
     * Reads the products listed on one category page.
     *
     * Anchors without an href are skipped. Emits 'crawlSKU' with the result.
     * A failure is logged and yields an empty list.
     *
     * @param {string} url - Address of the listing page.
     * @param {string} category - Category name copied onto every product.
     * @returns {Promise<Array<{link: string, sku: *, price: number, title: string, category: string}>>}
     */
    getProducts(url, category) {
      return httpsGet(url)
        .then(($) => {
          const products = [];
          $("ol#sku-list li div.price a.js-sku-link").toArray().forEach((item) => {
            const anchor = $(item);
            const href = anchor.attr('href');
            if (!href || href === "undefined") {
              return;
            }
            products.push({
              link: this.baseURL + href,
              // Take the SKU id from the list item that contains this anchor, so a
              // card without a price link cannot shift the pairing of later cards.
              sku: anchor.closest('li').data('skuid'),
              price: parsePrice(anchor.text()),
              title: anchor.attr('title'),
              category,
            });
          });
          Actions.emit('crawlSKU', products);
          return products;
        })
        .catch((er) => {
          console.log("ERROR PRODUCT GET : ", er);
          return [];
        });
    }

    /**
     * Reads every listing page of a category and merges the products.
     *
     * The page count is `ceil(total / PRODUCTS_PER_PAGE)`; pages are requested
     * as `<url>?page=<n>` starting at 1. A failure is logged and yields an
     * empty list.
     *
     * @param {string} [url] - Category address; `undefined` yields an empty list.
     * @param {number} total - Number of products in the category.
     * @param {string} name - Category name passed on to `getProducts`.
     * @returns {Promise<Array>} Products of all pages in page order.
     */
    getAllProducts(url, total, name) {
      if (url === undefined) return Promise.resolve([]);
      const pages = Math.ceil(total / PRODUCTS_PER_PAGE);
      const requests = [];
      for (let page = 1; page <= pages; page++) {
        requests.push(this.getProducts(`${url}?page=${page}`, name));
      }
      return Promise.all(requests)
        .then(flattenLists)
        .catch((e) => {
          console.log("ALL PRODUCTS ERROR:::::", url, e);
          return [];
        });
    }

    /**
     * Reads the shops offering one product together with their prices.
     *
     * Emits 'crawlProduct' with the result. Unlike the other lookups, a failed
     * request is not caught here and rejects the returned promise.
     *
     * @param {string} [url] - Product address; `undefined` yields an empty list.
     * @param {*} sku - Product id copied onto every offer.
     * @returns {Promise<Array<{name: string, price: number, sku: *, SKUName: string}>>}
     */
    getCompanies(url, sku) {
      if (url === undefined) return Promise.resolve([]);
      return httpsGet(url)
        .then(($) => {
          const offers = $("ol#prices li .shop .shop-name").toArray().map((item) => {
            const shop = $(item);
            const name = shop.text();
            // Read the price from the same list item as the shop name so the two
            // cannot drift apart when a row lacks one of them.
            const priceText = shop.closest('li').find('.price a.product-link').first().text();
            return {
              name,
              price: parsePrice(priceText),
              sku,
              SKUName: sku + name,
            };
          });
          Actions.emit('crawlProduct', offers);
          return offers;
        });
    }

    /**
     * Runs the full crawl: categories, then all products of every category,
     * then the offers of every product.
     *
     * @param {number} [limit=9999] - Maximum number of categories to crawl.
     * @returns {Promise<{categories: Array, skus: Array, companies: Array}>}
     */
    retrieve(limit = 9999) {
      const result = {
        categories: [],
        skus: [],
        companies: [],
      };
      return this.getCategories(limit)
        .then((categories) => {
          result.categories = categories;
          const perCategory = categories.map((cat) => this.getAllProducts(cat.link, cat.quantity, cat.name));
          return Promise.all(perCategory).then(flattenLists);
        })
        .then((skus) => {
          result.skus = skus;
          const perSku = skus.map((sku) => this.getCompanies(sku.link, sku.sku));
          return Promise.all(perSku).then(flattenLists);
        })
        .then((companies) => {
          result.companies = companies;
          return result;
        });
    }
  }
  155. // let crawler = new Crawler();
  156. // crawler.retrieve().then((data) => {
  157. // fs.writeFileSync("data", JSON.stringify(data));
  158. // console.log("SUCCESS123123");
  159. // console.log(data);
  160. // console.log("SUCCESS123123");
  161. // });
  162. //crawler.getCategories();
  163. // crawler.getAllProducts(BASE_URL + '/c/2729/anostromata/m/10646/Das-Home.html', 49);
  164. // crawler.getCompanies('/s/12890875/Das-Home-%CE%A3%CE%B5%CF%84-Ninjago-Lego-5008-160x260.html');
  // Export a single Crawler instance rather than the class itself.
  const sharedCrawler = new Crawler();
  module.exports = sharedCrawler;