# REVIEW PREAMBLE (commentary only; it sits before the first "diff --git" header,
# where patch tools skip leading text).
#
# NOTE: this patch text has been flattened: the original line breaks and the
# Python indentation are not recoverable from it, so nesting described below is
# inferred from statement order and should be confirmed against the repository.
#
# What the patch does
#   .gitignore
#     - Adds Lambdas/Scraping/scraperConfigs.json and Lambdas/Scraping/dbConfigs.json.
#   Lambdas/Scraping/KohlsScraper.py
#     - Adds imports: json, pymysql.cursors, time.
#     - Loads "scraperConfigs.json" and uses its "apikey" entry for the request
#       header, replacing the previous empty-string literal.
#     - Re-adds the per-match parsing statements (price / description / imgUrl)
#       with some string literals switched from single to double quotes.
#     - Appends (3, description, float(<text after '$' in price>), imgUrl) to
#       insert_params; the leading 3 is bound to the chainID column
#       (presumably the Kohl's chain id -- TODO confirm).
#     - Loads "dbConfigs.json" (keys: host, user, password, db_name), opens a
#       pymysql connection, runs executemany() with a parameterized
#       "INSERT INTO Product ..." statement, commits, and closes the connection
#       in a finally clause.
#   The file line between the two KohlsScraper.py hunks (new line 17, inside
#   "params = (") is not part of this patch text.
#
# Review notes (not fixed here; editing hunk lines would invalidate the hunk
# counts and index hashes)
#   - traceback.print_exc() is called in the except handler, but no
#     "import traceback" appears in any visible hunk (the first hunk starts at
#     file line 1). If it is really missing, the handler raises NameError and
#     hides the original database error.
#   - BeautifulSoup and re are used, but no import for them appears in the
#     visible hunks either -- verify; the script would fail with NameError.
#   - "time" is imported but not referenced in the visible hunks.
#   - price stays None when no token of match.text contains '$'; the later
#     price + " for: " ... and price.split('$') would then raise.
#   - float(price.split('$')[1]) raises ValueError for a price text that
#     contains a thousands separator (e.g. "$1,299.00").
#   - "price == None" should be "price is None".
#
diff --git a/.gitignore b/.gitignore index 55ac339..4b21ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,5 @@ Lambdas/Lists/target/classes/META-INF/Lists.kotlin_module Listify/app/src/main/res/raw/auths.json Lambdas/Lists/target/surefire-reports/TestInputUtils.txt Lambdas/Lists/target/surefire-reports/TEST-TestInputUtils.xml +Lambdas/Scraping/scraperConfigs.json +Lambdas/Scraping/dbConfigs.json diff --git a/Lambdas/Scraping/KohlsScraper.py b/Lambdas/Scraping/KohlsScraper.py index f01fc9d..d5058fd 100644 --- a/Lambdas/Scraping/KohlsScraper.py +++ b/Lambdas/Scraping/KohlsScraper.py @@ -1,7 +1,16 @@ import requests +import json + +import pymysql.cursors +import time + +scraper_configs = None +with open("scraperConfigs.json", "r") as scraper_configs_file: + scraper_configs = json.load(scraper_configs_file) headers = { - "apikey": "" + "apikey": scraper_configs["apikey"] + } params = ( @@ -9,29 +18,54 @@ params = ( ("location","na"), ); -response = requests.get('https://app.zenscrape.com/api/v1/get', headers=headers, params=params); +response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params); print(response.text) -soup = BeautifulSoup(response.text, 'html.parser') +soup = BeautifulSoup(response.text, "html.parser") + +insert_params = [] for match in soup.find_all(id=re.compile(".*_prod_price")): - price = None - description = "" - match_split = match.text.split() - for section in match_split: - if '$' in section: - description = "" - if price == None: - price = section - continue - if ('(' in section) or (')' in section): - continue - description += section + " " - description = description.strip() - imgUrl = "" - imgUrlBase = 'https://media.kohlsimg.com/is/image/kohls/' - for prior in match.previous_siblings: - if imgUrlBase in str(prior): - imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0] - print(price + " for: " + description + " @: " + imgUrl) + price = None + description = "" + 
match_split = match.text.split() + for section in match_split: + if '$' in section: + description = "" + if price == None: + price = section + continue + if ('(' in section) or (')' in section): + continue + description += section + " " + description = description.strip() + imgUrl = "" + imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/" + for prior in match.previous_siblings: + if imgUrlBase in str(prior): + imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0] + print(price + " for: " + description + " @: " + imgUrl) + insert_params.append((3, description, float(price.split('$')[1]), imgUrl)) +db_configs = None +with open("dbConfigs.json", "r") as db_configs_file: + db_configs = json.load(db_configs_file) + + +connection = pymysql.connect(host=db_configs["host"], + user=db_configs["user"], + password=db_configs["password"], + db=db_configs["db_name"], + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + +try: + with connection.cursor() as cursor: + PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);" + cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params) + connection.commit() +except Exception as e: + print(e) + traceback.print_exc() +finally: + connection.close()