mirror of
https://github.com/ClaytonWWilson/Listify.git
synced 2026-03-10 18:55:03 +00:00
Lambdaify Kohls scraping
Move to Lambda function setup and add orchestration and word lists for full runs. Credit for the word lists goes to: http://www.desiquintans.com/nounlist
This commit is contained in:
@@ -1,71 +1,80 @@
|
||||
import json
import re
import time
import traceback

import pymysql.cursors
import requests
from bs4 import BeautifulSoup
|
||||
scraper_configs = None
|
||||
with open("scraperConfigs.json", "r") as scraper_configs_file:
|
||||
scraper_configs = json.load(scraper_configs_file)
|
||||
def lambda_handler(event, context):
|
||||
print(event["toScrape"])
|
||||
scraper_configs = None
|
||||
with open("scraperConfigs.json", "r") as scraper_configs_file:
|
||||
scraper_configs = json.load(scraper_configs_file)
|
||||
|
||||
headers = {
|
||||
"apikey": scraper_configs["apikey"]
|
||||
|
||||
}
|
||||
headers = {
|
||||
"apikey": scraper_configs["apikey"]
|
||||
|
||||
}
|
||||
|
||||
params = (
|
||||
("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search=shoes"),
|
||||
("location","na"),
|
||||
);
|
||||
params = (
|
||||
("url","https://www.kohls.com/search.jsp?submit-search=web-regular&search="+ event["toScrape"]),
|
||||
("location","na"),
|
||||
);
|
||||
|
||||
response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params);
|
||||
print(response.text)
|
||||
response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
insert_params = []
|
||||
insert_params = []
|
||||
|
||||
for match in soup.find_all(id=re.compile(".*_prod_price")):
|
||||
price = None
|
||||
description = ""
|
||||
match_split = match.text.split()
|
||||
for section in match_split:
|
||||
if '$' in section:
|
||||
description = ""
|
||||
if price == None:
|
||||
price = section
|
||||
continue
|
||||
if ('(' in section) or (')' in section):
|
||||
continue
|
||||
description += section + " "
|
||||
description = description.strip()
|
||||
imgUrl = ""
|
||||
imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
|
||||
for prior in match.previous_siblings:
|
||||
if imgUrlBase in str(prior):
|
||||
imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
|
||||
print(price + " for: " + description + " @: " + imgUrl)
|
||||
insert_params.append((3, description, float(price.split('$')[1]), imgUrl))
|
||||
for match in soup.find_all(id=re.compile(".*_prod_price")):
|
||||
price = None
|
||||
description = ""
|
||||
match_split = match.text.split()
|
||||
for section in match_split:
|
||||
if '$' in section:
|
||||
description = ""
|
||||
if price == None:
|
||||
price = section
|
||||
continue
|
||||
if ('(' in section) or (')' in section):
|
||||
continue
|
||||
description += section + " "
|
||||
description = description.strip()
|
||||
imgUrl = ""
|
||||
imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
|
||||
for prior in match.previous_siblings:
|
||||
if imgUrlBase in str(prior):
|
||||
imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
|
||||
print(price + " for: " + description + " @: " + imgUrl)
|
||||
insert_params.append((3, description, float(price.split('$')[1]), imgUrl))
|
||||
|
||||
db_configs = None
|
||||
with open("dbConfigs.json", "r") as db_configs_file:
|
||||
db_configs = json.load(db_configs_file)
|
||||
db_configs = None
|
||||
with open("dbConfigs.json", "r") as db_configs_file:
|
||||
db_configs = json.load(db_configs_file)
|
||||
|
||||
|
||||
connection = pymysql.connect(host=db_configs["host"],
|
||||
user=db_configs["user"],
|
||||
password=db_configs["password"],
|
||||
db=db_configs["db_name"],
|
||||
charset='utf8mb4',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
connection = pymysql.connect(host=db_configs["host"],
|
||||
user=db_configs["user"],
|
||||
password=db_configs["password"],
|
||||
db=db_configs["db_name"],
|
||||
charset='utf8mb4',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
|
||||
try:
|
||||
with connection.cursor() as cursor:
|
||||
PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
|
||||
cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
|
||||
connection.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
connection.close()
|
||||
try:
|
||||
with connection.cursor() as cursor:
|
||||
PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
|
||||
cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
|
||||
connection.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
connection.close()
|
||||
|
||||
return {
|
||||
'statusCode': 200,
|
||||
'body': 'Scraped: ' + event["toScrape"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user