mirror of
https://github.com/ClaytonWWilson/Listify.git
synced 2026-03-10 18:55:03 +00:00
Lambdaify Kohls scraping
Move to Lambda function setup and add orchestration and word lists for full runs. Credit for the word lists goes to: http://www.desiquintans.com/nounlist
This commit is contained in:
@@ -1,71 +1,80 @@
|
||||
import json
import re
import time
import traceback

import pymysql.cursors
import requests
from bs4 import BeautifulSoup
|
||||
scraper_configs = None
|
||||
with open("scraperConfigs.json", "r") as scraper_configs_file:
|
||||
scraper_configs = json.load(scraper_configs_file)
|
||||
def lambda_handler(event, context):
|
||||
print(event["toScrape"])
|
||||
scraper_configs = None
|
||||
with open("scraperConfigs.json", "r") as scraper_configs_file:
|
||||
scraper_configs = json.load(scraper_configs_file)
|
||||
|
||||
headers = {
|
||||
"apikey": scraper_configs["apikey"]
|
||||
|
||||
}
|
||||
headers = {
|
||||
"apikey": scraper_configs["apikey"]
|
||||
|
||||
}
|
||||
|
||||
params = (
|
||||
("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search=shoes"),
|
||||
("location","na"),
|
||||
);
|
||||
params = (
|
||||
("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search="+ event["toScrape"]),
|
||||
("location","na"),
|
||||
);
|
||||
|
||||
response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params);
|
||||
print(response.text)
|
||||
response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
insert_params = []
|
||||
insert_params = []
|
||||
|
||||
for match in soup.find_all(id=re.compile(".*_prod_price")):
|
||||
price = None
|
||||
description = ""
|
||||
match_split = match.text.split()
|
||||
for section in match_split:
|
||||
if '$' in section:
|
||||
description = ""
|
||||
if price == None:
|
||||
price = section
|
||||
continue
|
||||
if ('(' in section) or (')' in section):
|
||||
continue
|
||||
description += section + " "
|
||||
description = description.strip()
|
||||
imgUrl = ""
|
||||
imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
|
||||
for prior in match.previous_siblings:
|
||||
if imgUrlBase in str(prior):
|
||||
imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
|
||||
print(price + " for: " + description + " @: " + imgUrl)
|
||||
insert_params.append((3, description, float(price.split('$')[1]), imgUrl))
|
||||
for match in soup.find_all(id=re.compile(".*_prod_price")):
|
||||
price = None
|
||||
description = ""
|
||||
match_split = match.text.split()
|
||||
for section in match_split:
|
||||
if '$' in section:
|
||||
description = ""
|
||||
if price == None:
|
||||
price = section
|
||||
continue
|
||||
if ('(' in section) or (')' in section):
|
||||
continue
|
||||
description += section + " "
|
||||
description = description.strip()
|
||||
imgUrl = ""
|
||||
imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
|
||||
for prior in match.previous_siblings:
|
||||
if imgUrlBase in str(prior):
|
||||
imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
|
||||
print(price + " for: " + description + " @: " + imgUrl)
|
||||
insert_params.append((3, description, float(price.split('$')[1]), imgUrl))
|
||||
|
||||
db_configs = None
|
||||
with open("dbConfigs.json", "r") as db_configs_file:
|
||||
db_configs = json.load(db_configs_file)
|
||||
db_configs = None
|
||||
with open("dbConfigs.json", "r") as db_configs_file:
|
||||
db_configs = json.load(db_configs_file)
|
||||
|
||||
|
||||
connection = pymysql.connect(host=db_configs["host"],
|
||||
user=db_configs["user"],
|
||||
password=db_configs["password"],
|
||||
db=db_configs["db_name"],
|
||||
charset='utf8mb4',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
connection = pymysql.connect(host=db_configs["host"],
|
||||
user=db_configs["user"],
|
||||
password=db_configs["password"],
|
||||
db=db_configs["db_name"],
|
||||
charset='utf8mb4',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
|
||||
try:
|
||||
with connection.cursor() as cursor:
|
||||
PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
|
||||
cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
|
||||
connection.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
connection.close()
|
||||
try:
|
||||
with connection.cursor() as cursor:
|
||||
PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
|
||||
cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
|
||||
connection.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
connection.close()
|
||||
|
||||
return {
|
||||
'statusCode': 200,
|
||||
'body': 'Scraped: ' + event["toScrape"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user