Listify/Lambdas/Scraping/KohlsScraper.py
NMerz ee04a06e0e Chaining, retries, and general scraping durability
Make the scraping infrastructure more durable since the scraping service often fails to deliver
2020-11-01 11:30:03 -05:00

import json
import re
import time
import traceback
from random import randint

import pymysql.cursors
import requests
from bs4 import BeautifulSoup


def lambda_handler(event, context):
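    """Scrape Kohl's search results for event["toScrape"] via Zenscrape and insert matches into the Product table."""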
print(event["toScrape"])
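    # The Zenscrape API key is read from a local config file rather than hard-coded.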
    with open("scraperConfigs.json", "r") as scraper_configs_file:
        scraper_configs = json.load(scraper_configs_file)
headers = {
"apikey": scraper_configs["apikey"]
}
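    # Zenscrape fetches the Kohl's search page on our behalf; requests
    # URL-encodes the search term when it serializes these params.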
    params = (
        ("url", "https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + event["toScrape"]),
        ("location", "na"),
    )
response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
    # Zenscrape intermittently returns 500s; retry with a randomized pause,
    # capped so a persistent outage cannot spin until the Lambda times out.
    retry_counter = 1
    while response.status_code == 500 and retry_counter <= 10:
        print("Retry #" + str(retry_counter))
        retry_counter += 1
        time.sleep(randint(5, 20))
        response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
    if response.status_code != 200:
        # Give up rather than parse an error page; surface the upstream status.
        print("Scraping status code: " + str(response.status_code))
        print(response.text)
        return {
            'statusCode': response.status_code,
            'body': 'Failed to scrape: ' + event["toScrape"]
        }
soup = BeautifulSoup(response.text, "html.parser")
insert_params = []
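    # Each search hit renders an element whose id ends in "_prod_price"; its
    # text holds the price token (marked by "$") followed by the description.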
for match in soup.find_all(id=re.compile(".*_prod_price")):
price = None
description = ""
        match_split = match.text.split()
        for section in match_split:
            if '$' in section:
                # A price token resets the description (it ends up holding
                # whatever follows the last price); only the first "$" token
                # is kept as the price.
                description = ""
                if price is None:
                    price = section
                continue
            # Skip parenthesized fragments.
            if ('(' in section) or (')' in section):
                continue
            description += section + " "
description = description.strip()
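        # The product image lives in an earlier sibling; recover its URL by
        # slicing the Kohl's media CDN prefix out of the raw markup.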
        img_url = ""
        img_url_base = "https://media.kohlsimg.com/is/image/kohls/"
        for prior in match.previous_siblings:
            if img_url_base in str(prior):
                img_url = img_url_base + str(prior).split(img_url_base)[1].split('?')[0].split('"')[0]
        if price is None:
            # No price token was found for this match; skip it rather than crash.
            continue
        print(price + " for: " + description + " @: " + img_url)
        # chainID 3 is the hard-coded identifier for Kohl's in the Product table.
        insert_params.append((3, description, float(price.split('$')[1]), img_url))
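    # Database credentials are likewise read from a local config file.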
    with open("dbConfigs.json", "r") as db_configs_file:
        db_configs = json.load(db_configs_file)
connection = pymysql.connect(host=db_configs["host"],
user=db_configs["user"],
password=db_configs["password"],
db=db_configs["db_name"],
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
try:
with connection.cursor() as cursor:
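            # Insert all scraped rows in one batch.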
PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
connection.commit()
except Exception as e:
print(e)
traceback.print_exc()
finally:
connection.close()
return {
'statusCode': 200,
'body': 'Scraped: ' + event["toScrape"]
}