Mirror of https://github.com/ClaytonWWilson/Listify.git (synced 2025-12-13 01:38:47 +00:00)
Lambdaify Kohls scraping
Move to a Lambda function setup and add orchestration and word lists for full runs. Credit for the word lists goes to http://www.desiquintans.com/nounlist
parent 7dbea8f805
commit 79598bf9e9
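With these pieces in place, a full run would presumably be kicked off by invoking the orchestrator Lambda (runOrchestrator.py below) once. A sketch of that trigger; the deployed function name "runOrchestrator" is an assumption, as only "KohlsScraper" is named anywhere in this commit:

# Hypothetical trigger for a full scraping run (not part of the commit).
import boto3

boto3.client("lambda").invoke(
    FunctionName="runOrchestrator",   # assumed deployed name of runOrchestrator.py
    InvocationType="Event",           # fire-and-forget, same style the orchestrator itself uses
    Payload="{}",                     # the orchestrator's handler ignores the event
)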
.gitignore (vendored): 1 line added

@@ -92,3 +92,4 @@ Lambdas/Lists/target/surefire-reports/TestInputUtils.txt
 Lambdas/Lists/target/surefire-reports/TEST-TestInputUtils.xml
 Lambdas/Scraping/scraperConfigs.json
 Lambdas/Scraping/dbConfigs.json
+Lambdas/Scraping/artifacts/*
Lambdas/Scraping/KohlsScraper.py (modified): 71 lines before, 80 after

Previously the scraper ran as a flat script: the search term was hard-coded ("search=shoes"), the raw Zenscrape response was printed, and everything executed at import time. The scraping and database code is now wrapped in a lambda_handler that takes the search term from event["toScrape"]; the module-level config load appears to be retained alongside a per-invocation reload inside the handler. The resulting file:

import requests
import json
from bs4 import BeautifulSoup
import pymysql.cursors
import time
import re
import traceback  # needed for traceback.print_exc() below; not imported in the committed file

scraper_configs = None
with open("scraperConfigs.json", "r") as scraper_configs_file:
    scraper_configs = json.load(scraper_configs_file)

def lambda_handler(event, context):
    print(event["toScrape"])

    scraper_configs = None
    with open("scraperConfigs.json", "r") as scraper_configs_file:
        scraper_configs = json.load(scraper_configs_file)

    headers = {
        "apikey": scraper_configs["apikey"]
    }

    # Zenscrape fetches the Kohls search results page for the requested term.
    params = (
        ("url", "https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + event["toScrape"]),
        ("location", "na"),
    )

    response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)

    soup = BeautifulSoup(response.text, "html.parser")

    insert_params = []

    # Each product tile carries an element whose id ends in "_prod_price"; its text holds
    # the price(s) and the product name. The first "$" token is taken as the price,
    # parenthesised original-price fragments are skipped, and the words after the last
    # "$" token become the description.
    for match in soup.find_all(id=re.compile(".*_prod_price")):
        price = None
        description = ""
        match_split = match.text.split()
        for section in match_split:
            if '$' in section:
                description = ""
                if price == None:
                    price = section
                continue
            if ('(' in section) or (')' in section):
                continue
            description += section + " "
        description = description.strip()
        imgUrl = ""
        imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
        # The product image URL lives in a preceding sibling element; pull out the kohlsimg path.
        for prior in match.previous_siblings:
            if imgUrlBase in str(prior):
                imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
        print(price + " for: " + description + " @: " + imgUrl)
        insert_params.append((3, description, float(price.split('$')[1]), imgUrl))

    db_configs = None
    with open("dbConfigs.json", "r") as db_configs_file:
        db_configs = json.load(db_configs_file)

    connection = pymysql.connect(host=db_configs["host"],
                                 user=db_configs["user"],
                                 password=db_configs["password"],
                                 db=db_configs["db_name"],
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    try:
        with connection.cursor() as cursor:
            PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
            cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
        connection.commit()
    except Exception as e:
        print(e)
        traceback.print_exc()
    finally:
        connection.close()

    return {
        'statusCode': 200,
        'body': 'Scraped: ' + event["toScrape"]
    }
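For local testing outside Lambda, the handler can be exercised directly with a stub event. This is only a sketch: it assumes scraperConfigs.json and dbConfigs.json sit next to the script and that the Zenscrape API and the database are reachable.

# Minimal local smoke test (not part of the commit); the Lambda context is unused by the handler.
from KohlsScraper import lambda_handler

result = lambda_handler({"toScrape": "shoes"}, None)
print(result)   # expected shape: {'statusCode': 200, 'body': 'Scraped: shoes'}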
Lambdas/Scraping/buildKohlsZip.sh (new file): 8 lines

#!/bin/bash
# Currently to be run only from the Scraping directory
OLDPWD=$(pwd)
cd build/
zip -r9 ${OLDPWD}/artifacts/kohlsScraper.zip .
cd ${OLDPWD}
zip -r9 artifacts/kohlsScraper.zip *.json
zip -r9 artifacts/kohlsScraper.zip KohlsScraper.py
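Once the zip exists, one plausible way to push it to the function is boto3's update_function_code. A sketch, assuming the KohlsScraper function already exists and AWS credentials/region are configured; the actual deployment method is not part of this commit:

# Hypothetical deploy step: upload artifacts/kohlsScraper.zip to the existing KohlsScraper function.
import boto3

with open("artifacts/kohlsScraper.zip", "rb") as zip_file:
    boto3.client("lambda").update_function_code(
        FunctionName="KohlsScraper",
        ZipFile=zip_file.read(),
    )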
Lambdas/Scraping/nounlist.txt (new file): 6801 lines (diff suppressed because it is too large)
Lambdas/Scraping/prefix_list_builder.py (new file): 25 lines

import json  # json.dump below needs this; missing from the committed script

wordlist = []
with open("nounlist.txt") as nounlist:
    for noun in nounlist:
        wordlist.append(noun)

# Keep only the first three characters of each noun as a search prefix.
prefix_list = []
for word in wordlist:
    prefix_list.append(word[:min(len(word), 3)])

# Split the prefixes into two parts, dropping consecutive duplicates:
# the first 700 go to part 1, the remainder to part 2.
short_list = []
short_list2 = []
for prefix in prefix_list:
    prefix = prefix.strip()
    if len(short_list) < 700:
        if (len(short_list) == 0 or short_list[-1] != prefix):
            short_list.append(prefix)
    else:
        if ((len(short_list2) == 0 or short_list2[-1] != prefix) and short_list[-1] != prefix):
            short_list2.append(prefix)

with open("prefix_list_part1.txt", "w") as prefix_list_part1:
    json.dump(short_list, prefix_list_part1)

with open("prefix_list_part2.txt", "w") as prefix_list_part2:
    json.dump(short_list2, prefix_list_part2)
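The orchestrator below reads its word list from words.txt, so presumably one of these generated prefix files is deployed under that name; that mapping is not spelled out in the commit. A quick sanity check of the generated output could look like this (file name taken from the script above):

# Hypothetical check of the generated prefix list.
import json

with open("prefix_list_part1.txt") as f:
    prefixes = json.load(f)

print(len(prefixes), prefixes[:5])   # e.g. 700 ['ATM', 'CD', 'SUV', 'TV', 'aar']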
Lambdas/Scraping/prefix_list_part1.txt (new file): 1 line

["ATM", "CD", "SUV", "TV", "aar", "aba", "abb", "abd", "abi", "abn", "abo", "abr", "abs", "abu", "aca", "acc", "ace", "ach", "aci", "ack", "aco", "acq", "acr", "act", "acu", "ad", "ada", "add", "adj", "adm", "ado", "adr", "adu", "adv", "aff", "afo", "aft", "age", "agg", "agl", "ago", "agr", "aid", "aim", "air", "ala", "alb", "alc", "ald", "ale", "alf", "alg", "ali", "all", "alm", "alp", "alt", "alu", "ama", "amb", "ame", "amm", "amn", "amo", "amu", "ana", "anc", "and", "ane", "ang", "ani", "ank", "ann", "ano", "ans", "ant", "anx", "any", "apa", "ape", "apo", "app", "apr", "aps", "aqu", "arc", "are", "arg", "ari", "ark", "arm", "arr", "art", "asc", "ash", "asi", "asp", "ass", "ast", "asy", "ate", "ath", "atm", "ato", "atr", "att", "auc", "aud", "aun", "aut", "ava", "ave", "avo", "awa", "awe", "axi", "azi", "bab", "bac", "bad", "baf", "bag", "bai", "bak", "bal", "bam", "ban", "bao", "bar", "bas", "bat", "bay", "bea", "bec", "bed", "bee", "beg", "beh", "bei", "bel", "ben", "ber", "bes", "bet", "bev", "bey", "bia", "bib", "bic", "bid", "bif", "bij", "bik", "bil", "bin", "bio", "bip", "bir", "bis", "bit", "bla", "ble", "bli", "blo", "blu", "boa", "bob", "bod", "bog", "bol", "bom", "bon", "boo", "bor", "bos", "bot", "bou", "bow", "box", "boy", "bra", "bre", "bri", "bro", "bru", "bub", "buc", "bud", "buf", "bug", "bui", "bul", "bum", "bun", "bur", "bus", "but", "buy", "buz", "c-c", "cab", "cac", "cad", "caf", "cag", "cak", "cal", "cam", "can", "cap", "car", "cas", "cat", "cau", "cav", "cay", "cei", "cel", "cem", "cen", "cep", "cer", "ces", "cha", "che", "chi", "cho", "chr", "chu", "cic", "cig", "cil", "cin", "cir", "cit", "civ", "cla", "cle", "cli", "clo", "clu", "co-", "coa", "cob", "coc", "cod", "coe", "cof", "coh", "coi", "cok", "col", "com", "con", "coo", "cop", "cor", "cos", "cot", "cou", "cov", "cow", "coy", "cra", "cre", "cri", "cro", "cru", "cry", "cub", "cuc", "cue", "cuf", "cui", "cul", "cum", "cup", "cur", "cus", "cut", "cyc", "cyg", "cyl", "cym", "cyn", "cys", "cyt", "dad", "daf", "dag", "dah", "dai", "dam", "dan", "dar", "das", "dat", "dau", "daw", "day", "dea", "deb", "dec", "ded", "dee", "def", "deg", "del", "dem", "den", "deo", "dep", "der", "des", "det", "dev", "dew", "dho", "dia", "dib", "dic", "die", "dif", "dig", "dil", "dim", "din", "dio", "dip", "dir", "dis", "div", "doc", "doe", "dog", "doi", "dol", "dom", "don", "doo", "dor", "dos", "dot", "dou", "dow", "doz", "dra", "dre", "dri", "dro", "dru", "dry", "duc", "dud", "due", "duf", "dug", "dul", "dum", "dun", "dup", "dur", "dus", "dut", "dwa", "dwe", "dyn", "dys", "e-b", "e-m", "e-r", "eag", "ear", "eas", "eat", "eav", "ecc", "ech", "ecl", "eco", "ect", "ecu", "edd", "edg", "edi", "edu", "eel", "eff", "egg", "ego", "eic", "eje", "elb", "eld", "ele", "elf", "eli", "elk", "ell", "elm", "elo", "elv", "ema", "emb", "eme", "emi", "emo", "emp", "emu", "ena", "enc", "end", "ene", "enf", "eng", "eni", "enj", "enq", "enr", "ent", "env", "enz", "epa", "epe", "eph", "epi", "epo", "equ", "era", "ere", "ero", "err", "esc", "esp", "ess", "est", "ete", "eth", "eup", "eur", "eva", "eve", "evi", "evo", "ex-", "exa", "exc", "exe", "exh", "exi", "exo", "exp", "ext", "eye", "eyr", "fab", "fac", "fah", "fai", "fal", "fam", "fan", "far", "fas", "fat", "fau", "fav", "faw", "fax", "fea", "fed", "fee", "fel", "fem", "fen", "fer", "fes", "fet", "few", "fib", "fic", "fid", "fie", "fif", "fig", "fil", "fin", "fir", "fis", "fit", "fix", "fla", "fle", "fli", "flo", "flu", "fly", "foa", "fob", "foc", "fog", "fol", "fon", "foo", "for", "fou", "fow", "fox", 
"fra", "fre", "fri", "fro", "fru", "fry", "fuc", "fue", "fug", "ful", "fun", "fur", "fus", "fut", "gad", "gaf", "gai", "gal", "gam", "gan", "gap", "gar", "gas", "gat", "gau", "gav", "gaz", "gea", "gee", "gel", "gem", "gen", "geo", "ger", "ges", "gey", "ghe", "gho", "gia", "gif", "gig", "gin", "gir", "git", "gla", "gle", "gli", "glo", "glu", "gna", "gnu", "go-", "goa", "gob", "god", "gog", "goi", "gol", "gon", "goo", "gop", "gor", "gos", "gov", "gow", "gra", "gre", "gri", "gro", "gru", "gua", "gue", "gui", "gum", "gun", "gut", "guy", "gym", "gyn", "gyr", "hab", "hac", "hai", "hak", "hal", "ham", "han", "hap", "har", "has", "hat", "hau", "hav", "haw", "hay", "haz", "hea", "hec", "hed", "hee", "hei", "hel", "hem", "hen", "hep", "her", "hes", "het", "hex", "hey", "hic", "hid", "hie", "hig", "hik", "hil", "hin", "hip", "hir", "his", "hit", "hiv", "hob", "hoc", "hoe", "hog", "hol", "hom", "hon", "hoo", "hop", "hor", "hos", "hot", "hou", "hov", "how", "hub", "hug", "hul", "hum", "hun", "hur", "hus", "hut", "hya", "hyb", "hyd", "hye", "hyg", "hyp", "ice", "ici", "ico", "icy", "id", "ide", "idi", "igl", "ign", "ike", "ill", "ima", "imb", "imi", "imm", "imp", "in-", "ina", "inb", "inc", "ind", "ine", "inf", "ing", "inh", "ini", "inj", "ink", "inl", "inn", "inp", "inq", "ins", "int", "inv", "iri", "iro", "irr", "isc", "isl", "iso"]
Lambdas/Scraping/prefix_list_part2.txt (new file): 1 line (diff suppressed because one or more lines are too long)
Lambdas/Scraping/runOrchestrator.py (new file): 19 lines

import json
import boto3

def lambda_handler(event, context):
    with open("words.txt") as words_file:
        words = json.load(words_file)
    print(words)
    # Fan out: one asynchronous ("Event") KohlsScraper invocation per word.
    for word in words:
        client = boto3.client('lambda')
        response = client.invoke(
            FunctionName='KohlsScraper',
            InvocationType="Event",
            LogType="None",
            Payload="""{"toScrape": \"""" + word + "\"}"
        )
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
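The invocation payload is assembled by hand-escaped string concatenation; an equivalent, quoting-safe construction would be json.dumps, sketched here for comparison (not what the committed code does):

# Equivalent payload construction (sketch): json.dumps handles quoting instead of manual escaping.
import json

word = "aba"                                # example prefix from the generated list
payload = json.dumps({"toScrape": word})    # -> '{"toScrape": "aba"}'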
Lambdas/Scraping/setupBuildFolder.sh (new file): 7 lines

#!/bin/bash
# Currently to be run only from the Scraping directory
mkdir build
pip3 install --target build requests
pip3 install --target build PyMySQL
pip3 install --target build beautifulsoup4
mkdir artifacts