main.js 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. const cheerio = require('cheerio');
  2. const rp = require('request-promise');
  3. const fs = require('fs');
  // Origin of the site being crawled; relative links scraped from pages are appended to it.
  const BASE_URL = "https://www.skroutz.gr";
  // Path (relative to BASE_URL) of the shop page whose categories are the crawl's starting point.
  const SHOP_URL = "/m.DasHome.10646.html";
  // Pending requests, oldest first. Each entry is a [resolve, reject, requestOptions] triple.
  const queue = [];
  // Timer handle while the queue-draining loop is running; null while it is idle.
  let _timer = null;
  /**
   * Drains the request queue, issuing at most one request per second.
   *
   * Each call takes the oldest [resolve, reject, options] entry off `queue`,
   * starts the request with `rp(options)` and wires its outcome to the entry's
   * resolve/reject callbacks, then schedules itself again 1000 ms later.
   * When the queue is empty it resets `_timer` to null and stops, which lets
   * httpsGet() restart the loop on the next enqueue.
   */
  function exec () {
    if (!queue.length) {
      // Nothing left to do: mark the loop as idle and stop rescheduling.
      _timer = null;
      return;
    }
    const [resolve, reject, options] = queue.shift();
    console.log("Performing Query", options.uri);
    try {
      rp(options).then(resolve, reject);
    } catch (err) {
      // A synchronous failure while starting the request must only fail this
      // entry; letting it escape would skip the reschedule below and stall
      // every request still waiting in the queue.
      reject(err);
    }
    setTimeout(exec, 1000);
  }
  /**
   * Enqueues a GET request for `uri` and returns a promise for its result.
   *
   * The request is not sent here: its options are pushed onto `queue` together
   * with the promise's resolve/reject callbacks, and exec() performs it later.
   * If no drain loop is running (`_timer` is falsy) one is started after 100 ms.
   * The request options ask for the response body to be passed through
   * `cheerio.load`, so the promise settles with whatever that returns.
   *
   * @param {string} uri - Address to fetch.
   * @returns {Promise} Settled by exec() with the outcome of the request.
   */
  function httpsGet(uri) {
    const headers = {
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    };
    const parseBody = (body) => cheerio.load(body);
    const request = { uri, headers, transform: parseBody };

    return new Promise((resolve, reject) => {
      queue.push([resolve, reject, request]);
      if (_timer) {
        // A drain loop is already running and will pick this entry up.
        return;
      }
      _timer = setTimeout(exec, 100);
    });
  }
  /**
   * Scrapes a shop's catalogue: its categories, the products (SKUs) listed in
   * each category, and the per-shop offers listed on each product page.
   * All pages are fetched through httpsGet(), which resolves to the page
   * loaded by cheerio.
   */
  class Crawler {
    constructor() {
      // Prefix for every relative link scraped from a page.
      this.baseURL = BASE_URL;
    }

    /**
     * Parses a price label such as "12,90 €" or "1.299,50 €" into a number.
     *
     * When the label contains a comma it is taken as the decimal separator and
     * any dots before it are dropped as thousands separators. Labels without a
     * comma are handed to parseFloat unchanged (apart from the currency sign).
     *
     * @param {string} text - Price label as scraped from the page.
     * @returns {number} Parsed price, or NaN when the label holds no number.
     */
    static parsePrice(text) {
      let cleaned = text.replace('€', '').trim();
      const commaAt = cleaned.indexOf(',');
      if (commaAt !== -1) {
        const whole = cleaned.slice(0, commaAt).split('.').join('');
        cleaned = whole + '.' + cleaned.slice(commaAt + 1);
      }
      return parseFloat(cleaned);
    }

    /**
     * Reads the category cards from the shop page.
     *
     * @returns {Promise<Array<{name: string, quantity: number, link: string}>>}
     *   One entry per category anchor. `quantity` is the number shown in the
     *   anchor's span (NaN when it cannot be parsed). Resolves to an empty
     *   array, after logging, when the page cannot be fetched or parsed.
     */
    getCategories() {
      return httpsGet(this.baseURL + SHOP_URL)
        .then(($) => {
          const anchors = $(".super-categories .card:not(.manufacturer-card) h2 a").toArray();
          // Every field is read from the same anchor so the values cannot
          // drift out of alignment when one anchor lacks a span.
          return anchors.map((anchor) => {
            const el = $(anchor);
            const count = el.find("span").first().text().replace('(', '').replace(')', ''); // remove parentheses
            return {
              name: el.attr('title'),
              quantity: parseInt(count, 10),
              link: this.baseURL + el.attr('href')
            };
          });
        })
        .catch((e) => {
          console.log("ERROR:::::", e);
          // Keep the crawl best-effort, but hand callers an array they can iterate.
          return [];
        });
    }

    /**
     * Reads the products listed on a single category page.
     *
     * @param {string} url - Absolute URL of the category page (including any ?page=).
     * @returns {Promise<Array<{link: string, sku: *, price: string, title: string}>>}
     *   One entry per list item that contains a product link; items without
     *   one are skipped. Resolves to an empty array, after logging, on failure.
     */
    getProducts(url) {
      return httpsGet(url)
        .then(($) => {
          const result = [];
          $("ol#sku-list li").toArray().forEach((li) => {
            const item = $(li);
            const anchor = item.find("div.price a.js-sku-link").first();
            const href = anchor.attr('href');
            if (href === undefined) {
              // No product link in this item: skipping it avoids emitting a
              // bogus "<baseURL>undefined" link that would later be crawled.
              return;
            }
            result.push({
              link: this.baseURL + href,
              sku: item.data('skuid'),
              price: anchor.text(),
              title: anchor.attr('title')
            });
          });
          return result;
        })
        .catch((er) => {
          console.log("ERROR PRODUCT GET : ", er);
          return [];
        });
    }

    /**
     * Reads every page of a category and concatenates the products found.
     *
     * @param {string} url - Absolute URL of the category (without ?page=).
     * @param {number} total - Number of products the category advertises.
     * @param {number} [pageSize=48] - Products per listing page, used to derive the page count.
     * @returns {Promise<Array>} Products of all pages in page order; an empty
     *   array when the page count cannot be derived or on failure.
     */
    getAllProducts(url, total, pageSize = 48) {
      console.log(url);
      const pages = Math.ceil(total / pageSize);
      if (!Number.isFinite(pages)) {
        // NaN (unparsable total) or Infinity (pageSize 0): nothing sensible to request.
        return Promise.resolve([]);
      }
      const requests = [];
      for (let page = 1; page <= pages; page++) {
        requests.push(this.getProducts(`${url}?page=${page}`));
      }
      return Promise.all(requests)
        .then((perPage) => [].concat(...perPage))
        .catch((e) => {
          console.log("ALL PRODUCTS ERROR:::::", url, e);
          return [];
        });
    }

    /**
     * Reads the shop offers listed on a product page.
     *
     * Unlike the other getters this one does not catch failures: a rejected
     * request rejects the returned promise.
     *
     * @param {string} url - Absolute URL of the product page.
     * @returns {Promise<Array<{name: string, price: number}>>} One entry per
     *   list item that names a shop; `price` is NaN when no price is found.
     */
    getCompanies(url) {
      console.log(url);
      return httpsGet(url)
        .then(($) => {
          const res = [];
          $("ol#prices li").toArray().forEach((li) => {
            const item = $(li);
            const shopName = item.find(".shop .shop-name").first();
            if (!shopName.length) {
              return;
            }
            res.push({
              name: shopName.text(),
              price: Crawler.parsePrice(item.find(".price a.product-link").first().text())
            });
          });
          return res;
        });
    }

    /**
     * Runs the full crawl: categories, then all products of every category,
     * then the offers of every product.
     *
     * @returns {Promise<{categories: Array, skus: Array, companies: Array}>}
     *   `skus` is a single flat list of products across all categories and
     *   `companies[i]` holds the offers found for `skus[i]`.
     */
    retrieve() {
      const result = {
        categories: [],
        skus: [],
        companies: []
      };
      return this.getCategories()
        .then((categories) => {
          result.categories = categories;
          return Promise.all(categories.map((cat) => this.getAllProducts(cat.link, cat.quantity)));
        })
        .then((perCategory) => {
          // getAllProducts yields one array per category; flatten them so each
          // element is a product object carrying its own `link`.
          result.skus = [].concat(...perCategory);
          return Promise.all(result.skus.map((sku) => this.getCompanies(sku.link)));
        })
        .then((companies) => {
          result.companies = companies;
          return result;
        });
    }
  }
  // Entry point: run the full crawl and dump the result as JSON into a file named "data".
  const crawler = new Crawler();
  crawler.retrieve()
    .then((data) => {
      fs.writeFileSync("data", JSON.stringify(data));
      console.log("SUCCESS123123");
      console.log(data);
      console.log("SUCCESS123123");
    })
    .catch((err) => {
      // Without this handler a failed crawl (or a failed write) would only
      // surface as an unhandled promise rejection.
      console.error("CRAWL FAILED:", err);
      process.exitCode = 1;
    });
  // Manual debugging calls for individual steps, kept for reference:
  // crawler.getCategories();
  // crawler.getAllProducts('/c/1158/kourtines/m/10646/Das-Home.html', 49);
  // crawler.getCompanies('/s/12890875/Das-Home-%CE%A3%CE%B5%CF%84-Ninjago-Lego-5008-160x260.html');