Chaining, retries, and general scraping durability

Make the scraping infrastructure more durable, since the scraping service often fails to deliver.
NMerz
2020-11-01 11:30:03 -05:00
parent 79598bf9e9
commit ee04a06e0e
21 changed files with 74 additions and 19 deletions


@@ -2,11 +2,12 @@ import requests
 import json
 from bs4 import BeautifulSoup
 import pymysql.cursors
+import time
+from random import randint
 import re
 def lambda_handler(event, context):
     print(event["toScrape"])
     scraper_configs = None
@@ -19,14 +20,27 @@ def lambda_handler(event, context):
     }
     params = (
-        ("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search="+ event["toScrape"]),
+        ("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + event["toScrape"]),
         ("location","na"),
     );
     response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
+    retry_counter = 1
+    while response.status_code == 500:
+        print("Retry #" + str(retry_counter))
+        retry_counter += 1
+        time.sleep(randint(5,20))
+        response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
+    if response.status_code != 200:
+        print("Scraping status code: " + str(response.status_code))
+        print(response.text)
     soup = BeautifulSoup(response.text, "html.parser")
     insert_params = []
     for match in soup.find_all(id=re.compile(".*_prod_price")):
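
For reference, a minimal sketch of how the retry loop above could be factored into a reusable helper. The function name get_with_retries and the max_retries cap are assumptions, not part of this commit (the loop in the diff retries indefinitely while the service returns HTTP 500):

    import time
    from random import randint

    import requests

    def get_with_retries(url, headers=None, params=None, max_retries=5):
        """Hypothetical helper mirroring the retry pattern in the diff above."""
        response = requests.get(url, headers=headers, params=params)
        retry_counter = 1
        # Retry only on HTTP 500, as the commit does; the max_retries cap is
        # an assumption added here so the loop cannot spin forever.
        while response.status_code == 500 and retry_counter <= max_retries:
            print("Retry #" + str(retry_counter))
            retry_counter += 1
            # Jittered 5-20 second pause, matching time.sleep(randint(5,20)).
            time.sleep(randint(5, 20))
            response = requests.get(url, headers=headers, params=params)
        return response

    # Usage against the same endpoint the handler calls:
    # response = get_with_retries("https://app.zenscrape.com/api/v1/get",
    #                             headers=headers, params=params)

The randomized sleep acts as jitter: concurrent Lambda invocations that hit a failing scrape back off on different schedules instead of retrying in lockstep.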