Listify/Lambdas/Scraping/KohlsScraper.py
NMerz 79598bf9e9 Lambdaify Kohls scraping
Move to Lambda function setup and add orchestration and word lists for full runs. Credit for the word lists goes to: http://www.desiquintans.com/nounlist
2020-10-18 17:51:08 -04:00

81 lines
2.6 KiB
Python

import json
import re
import time
import traceback

import pymysql.cursors
import requests
from bs4 import BeautifulSoup
def _extract_products(soup):
    """Parse products out of a Kohl's search-results page.

    Scans every element whose id matches ``*_prod_price``; within each,
    tokens containing '$' are treated as the price (first one wins) and the
    remaining tokens (minus parenthesised fragments) form the description.
    The product image URL, when present, is recovered from a preceding
    sibling that embeds the kohlsimg base URL.

    Returns a list of (chainID, description, price, imageURL) tuples ready
    for executemany() against the Product table. chainID is hard-coded to 3
    (presumably the Kohl's chain row — confirm against the Chain table).
    """
    img_url_base = "https://media.kohlsimg.com/is/image/kohls/"
    insert_params = []
    for match in soup.find_all(id=re.compile(".*_prod_price")):
        price = None
        description = ""
        for section in match.text.split():
            if '$' in section:
                # A '$' token marks the price; everything seen before it is
                # noise (e.g. "sale"/"was" labels), so restart the description.
                description = ""
                if price is None:
                    price = section
                continue
            if ('(' in section) or (')' in section):
                # Skip parenthesised fragments such as review counts.
                continue
            description += section + " "
        description = description.strip()
        if price is None:
            # No price token found for this element — skip it instead of
            # crashing on float(None) below.
            continue
        img_url = ""
        for prior in match.previous_siblings:
            if img_url_base in str(prior):
                # Cut the URL at '?' (query string) or '"' (attribute end),
                # whichever comes first in the raw markup.
                img_url = img_url_base + str(prior).split(img_url_base)[1].split('?')[0].split('"')[0]
        print(price + " for: " + description + " @: " + img_url)
        insert_params.append((3, description, float(price.split('$')[1]), img_url))
    return insert_params


def lambda_handler(event, context):
    """Scrape Kohl's search results for event["toScrape"] and store products.

    Fetches the Kohl's search page through the zenscrape proxy API (API key
    read from scraperConfigs.json), parses product rows out of the HTML, and
    bulk-inserts them into the Product table (connection details read from
    dbConfigs.json).

    Parameters:
        event:   Lambda event dict; must contain the search term under
                 "toScrape".
        context: Lambda context object (unused).

    Returns:
        dict with 'statusCode' 200 and a 'body' echoing the search term.
        DB errors are logged and swallowed (best-effort insert), so the
        status is 200 even when the insert fails.
    """
    search_term = event["toScrape"]
    print(search_term)

    with open("scraperConfigs.json", "r") as scraper_configs_file:
        scraper_configs = json.load(scraper_configs_file)
    headers = {
        "apikey": scraper_configs["apikey"]
    }
    # requests URL-encodes the nested target URL when it serialises params.
    params = (
        ("url", "https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + search_term),
        ("location", "na"),
    )
    response = requests.get("https://app.zenscrape.com/api/v1/get",
                            headers=headers, params=params)
    soup = BeautifulSoup(response.text, "html.parser")

    insert_params = _extract_products(soup)

    with open("dbConfigs.json", "r") as db_configs_file:
        db_configs = json.load(db_configs_file)
    connection = pymysql.connect(host=db_configs["host"],
                                 user=db_configs["user"],
                                 password=db_configs["password"],
                                 db=db_configs["db_name"],
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
            cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
        connection.commit()
    except Exception as e:
        # Best-effort persistence: log and continue so the Lambda still
        # reports completion. (Original called traceback without importing
        # it, which turned any DB error into a NameError.)
        print(e)
        traceback.print_exc()
    finally:
        connection.close()
    return {
        'statusCode': 200,
        'body': 'Scraped: ' + event["toScrape"]
    }