Chaining, retries, and general scraping durability

Make the scraping infrastructure more durable, since the scraping service often fails to deliver.
NMerz
2020-11-01 11:30:03 -05:00
parent 79598bf9e9
commit ee04a06e0e
21 changed files with 74 additions and 19 deletions


@@ -2,11 +2,12 @@ import requests
 import json
 from bs4 import BeautifulSoup
 import pymysql.cursors
+import time
+from random import randint
 import re
 def lambda_handler(event, context):
     print(event["toScrape"])
     scraper_configs = None
@@ -19,14 +20,27 @@ def lambda_handler(event, context):
     }
     params = (
-        ("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search="+ event["toScrape"]),
+        ("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + event["toScrape"]),
         ("location","na"),
     );
     response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
+    retry_counter = 1
+    while response.status_code == 500:
+        print("Retry #" + str(retry_counter))
+        retry_counter += 1
+        time.sleep(randint(5,20))
+        response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
+    if response.status_code != 200:
+        print("Scraping status code: " + str(response.status_code))
+        print(response.text)
     soup = BeautifulSoup(response.text, "html.parser")
     insert_params = []
     for match in soup.find_all(id=re.compile(".*_prod_price")):
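
For reference, a minimal sketch of how the retry loop above could be factored into a reusable helper. The function name get_with_retries and the max_retries cap are assumptions, not part of this commit (the loop in the diff retries indefinitely while the service returns HTTP 500):

    import time
    from random import randint

    import requests

    def get_with_retries(url, headers=None, params=None, max_retries=5):
        """Hypothetical helper mirroring the retry pattern in the diff above."""
        response = requests.get(url, headers=headers, params=params)
        retry_counter = 1
        # Retry only on HTTP 500, as the commit does; the max_retries cap is
        # an assumption added here so the loop cannot spin forever.
        while response.status_code == 500 and retry_counter <= max_retries:
            print("Retry #" + str(retry_counter))
            retry_counter += 1
            # Jittered 5-20 second pause, matching time.sleep(randint(5,20)).
            time.sleep(randint(5, 20))
            response = requests.get(url, headers=headers, params=params)
        return response

    # Usage against the same endpoint the handler calls:
    # response = get_with_retries("https://app.zenscrape.com/api/v1/get",
    #                             headers=headers, params=params)

The randomized sleep acts as jitter: concurrent Lambda invocations that hit a failing scrape back off on different schedules instead of retrying in lockstep.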