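"""Scraper for theTVDB.com.

Downloads a series' info, actor data and profile images, per-episode json
files and thumbnails, and artwork (banners, fanart, posters), and provides
small helpers for installing requirements and updating the script via git.
"""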
import os
import shutil
import requests
import datetime
import dateutil
from bs4 import BeautifulSoup
import json
import subprocess
import sys

from utils import APIConnector
from utils import create_file_name
from utils import ProgressBar


# TODO add counters for number of images downloaded and deleted


def wait():
    input("Press enter to continue.")
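

# The requests.get calls in download() below save whatever bytes come back
# without checking the response status, so a failed request can silently leave
# a broken image on disk. This is a minimal sketch of a more defensive fetch
# helper; it is hypothetical and not used by the code below.
def fetch_image(url, dest_path, retries=3):
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=30)
            res.raise_for_status()
        except requests.RequestException:
            # Give up only after the last attempt
            if attempt == retries - 1:
                raise
            continue
        with open(dest_path, 'wb') as f:
            f.write(res.content)
        return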


# Downloads all data for a series
def download(series):
    # Create downloads folder
    if not os.path.exists("downloads"):
        os.makedirs("downloads")

    # Remove previously downloaded content for this series if it exists
    if os.path.exists(os.path.join("downloads", series.folder_name)):
        # BUG Sometimes this call will not remove the entire folder;
        # exist_ok=True on the makedirs call below keeps that from crashing
        shutil.rmtree(os.path.join("downloads", series.folder_name), ignore_errors=True)

    # Create series folder
    os.makedirs(os.path.join("downloads", series.folder_name), exist_ok=True)

    print("Downloading data for " + series.name)

    api_con = APIConnector()

    # Download series text data to info.json
    api_path = "https://api.thetvdb.com/series/{}".format(series.id)
    res = api_con.send_http_req(api_path)

    info_path = os.path.join("downloads", series.folder_name, "info.json")

    with open(info_path, 'wb') as f:
        f.write(res.content)

    # Make a folder for actors
    actors_folder_path = os.path.join("downloads", series.folder_name, "actors")
    os.makedirs(actors_folder_path)

    # Download actors to actors.json
    api_path = "https://api.thetvdb.com/series/{}/actors".format(series.id)
    res = api_con.send_http_req(api_path)

    actors_path = os.path.join(actors_folder_path, "actors.json")

    with open(actors_path, 'wb') as f:
        f.write(res.content)

    # Make folder for actor profile images
    actors_profile_folder_path = os.path.join(actors_folder_path, "profiles")
    os.makedirs(actors_profile_folder_path)

    # Parse the actor list once and size the progress bar to the number of
    # profile pictures that will be downloaded
    actors = json.loads(res.content)["data"]
    progress_bar = ProgressBar(len(actors))

    # Download each actor's profile picture and save it under their name
    for actor in actors:
        # Print progress bar to the screen
        sys.stdout.write("\rDownloading Actors... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), str(actor["id"]) + ".jpg"))
        sys.stdout.flush()

        name = create_file_name(actor["name"])
        profile_path = os.path.join(actors_profile_folder_path, name + '_' + str(actor["id"]) + ".jpg")

        # Check if there is an image for the actor
        if actor["image"] != "":
            img_res = requests.get("https://www.thetvdb.com/banners/" + actor["image"])
            with open(profile_path, 'wb') as f:
                f.write(img_res.content)
        else:
            # Use a default image if one does not exist on theTVDB.com
            shutil.copyfile(os.path.join("resources", "default_person.jpg"), profile_path)

        progress_bar.increment()

    # Print that the operation is done
    sys.stdout.write("\rDownloading Actors... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()

    # Make a folder for episodes
    episodes_folder_path = os.path.join("downloads", series.folder_name, "episodes")
    os.makedirs(episodes_folder_path)

    # Get the list of aired seasons
    api_path = "https://api.thetvdb.com/series/{}/episodes/summary".format(series.id)
    res = api_con.send_http_req(api_path)
    seasons = json.loads(res.content)["data"]["airedSeasons"]

    # Create a folder for each season
    for season in seasons:
        season_folder_path = os.path.join(episodes_folder_path, "Season " + season)
        os.makedirs(season_folder_path)

    # Download episode info to episodes.json
    api_path = "https://api.thetvdb.com/series/{}/episodes".format(series.id)
    res = api_con.send_http_req(api_path)
    with open(os.path.join(episodes_folder_path, "episodes.json"), 'wb') as f:
        f.write(res.content)

    # Each episode needs a json file and a thumbnail, so the progress bar
    # counts two steps per episode
    episodes = json.loads(res.content)["data"]
    progress_bar = ProgressBar(len(episodes) * 2)

    # Separate episode data into individual json files for each episode and download episode thumbnails
    for episode in episodes:
        # Sanitize the episode name (as done for actor names above) so it is
        # safe to use in a file path
        episode_name = create_file_name(episode["episodeName"])
        episode_label = "Episode {} - {}".format(episode["airedEpisodeNumber"], episode_name)
        episode_path = os.path.join(episodes_folder_path, "Season " + str(episode["airedSeason"]), episode_label)

        # Update progress bar and display it in the terminal
        sys.stdout.write("\rDownloading Episodes... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), episode_label + ".json "))
        sys.stdout.flush()

        with open(episode_path + ".json", 'w') as f:
            f.write(json.dumps(episode))
        progress_bar.increment()

        # Update progress bar and display it in the terminal
        sys.stdout.write("\rDownloading Episodes... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), episode_label + ".jpg "))
        sys.stdout.flush()

        img_res = requests.get("https://www.thetvdb.com/banners/" + episode["filename"])
        with open(episode_path + ".jpg", 'wb') as f:
            f.write(img_res.content)
        progress_bar.increment()

    sys.stdout.write("\rDownloading Episodes... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()
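
    # Note: on theTVDB API this episodes endpoint is paginated (the response
    # carries a "links" object with "next"/"last" page numbers), so the code
    # above only handles the first page of results. A sketch of collecting
    # every page, assuming send_http_req accepts any URL on the API:
    #
    #     episodes = []
    #     page = 1
    #     while page is not None:
    #         res = api_con.send_http_req(
    #             "https://api.thetvdb.com/series/{}/episodes?page={}".format(series.id, page))
    #         body = json.loads(res.content)
    #         episodes.extend(body["data"])
    #         page = body["links"]["next"]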

    # Make a folder for images
    images_folder_path = os.path.join("downloads", series.folder_name, "images")
    os.makedirs(images_folder_path)

    # Make a folder for each image type
    banners_folder_path = os.path.join(images_folder_path, "banners")
    os.makedirs(banners_folder_path)
    fanart_folder_path = os.path.join(images_folder_path, "fanart")
    os.makedirs(fanart_folder_path)
    posters_folder_path = os.path.join(images_folder_path, "posters")
    os.makedirs(posters_folder_path)

    # The API does not return links for every image hosted on the website,
    # so the most reliable way to get all of them is to scrape the website directly

    # Download banners
    banners_page = requests.get("{}/artwork/banners".format(series.url))
    banners_soup = BeautifulSoup(banners_page.content, "html.parser")

    # Make a progress bar for the banners
    banner_images = banners_soup.find_all("img", attrs={"class": "media-object img-responsive"})
    progress_bar = ProgressBar(len(banner_images))

    counter = 0
    for image in banner_images:
        sys.stdout.write("\rDownloading Banners... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "{:03d}.jpg".format(counter)))
        sys.stdout.flush()

        image_res = requests.get(image["src"])
        with open(os.path.join(banners_folder_path, "{:03d}.jpg".format(counter)), 'wb') as f:
            f.write(image_res.content)
        counter += 1
        progress_bar.increment()

    sys.stdout.write("\rDownloading Banners... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()

    # Download fanart
    fanart_page = requests.get("{}/artwork/fanart".format(series.url))
    fanart_soup = BeautifulSoup(fanart_page.content, "html.parser")

    # Make a progress bar for the fanart
    fanart_images = fanart_soup.find_all("img", attrs={"class": "media-object img-responsive"})
    progress_bar = ProgressBar(len(fanart_images))

    counter = 0
    for image in fanart_images:
        sys.stdout.write("\rDownloading Fanart... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "{:03d}.jpg".format(counter)))
        sys.stdout.flush()

        image_res = requests.get(image["src"])
        with open(os.path.join(fanart_folder_path, "{:03d}.jpg".format(counter)), 'wb') as f:
            f.write(image_res.content)
        counter += 1
        progress_bar.increment()

    sys.stdout.write("\rDownloading Fanart... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()

    # Download posters
    posters_page = requests.get("{}/artwork/poster".format(series.url))
    posters_soup = BeautifulSoup(posters_page.content, "html.parser")

    # Make a progress bar for the posters
    poster_images = posters_soup.find_all("img", attrs={"class": "media-object img-responsive"})
    progress_bar = ProgressBar(len(poster_images))

    counter = 0
    for image in poster_images:
        sys.stdout.write("\rDownloading Posters... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "{:03d}.jpg".format(counter)))
        sys.stdout.flush()

        image_res = requests.get(image["src"])
        with open(os.path.join(posters_folder_path, "{:03d}.jpg".format(counter)), 'wb') as f:
            f.write(image_res.content)
        counter += 1
        progress_bar.increment()

    sys.stdout.write("\rDownloading Posters... {} {}% {}".format(progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()
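

# The three artwork loops above are identical apart from the page name, the
# destination folder, and the progress label. A possible consolidation, shown
# as a sketch rather than a drop-in replacement (it assumes ProgressBar and the
# page markup behave exactly as in the loops above); it is not called anywhere
# in this file.
def download_artwork(series_url, artwork_type, dest_folder, label):
    page = requests.get("{}/artwork/{}".format(series_url, artwork_type))
    soup = BeautifulSoup(page.content, "html.parser")
    images = soup.find_all("img", attrs={"class": "media-object img-responsive"})
    progress_bar = ProgressBar(len(images))

    for counter, image in enumerate(images):
        sys.stdout.write("\rDownloading {}... {} {}% {:03d}.jpg".format(label, progress_bar.to_string(), progress_bar.get_percent(), counter))
        sys.stdout.flush()

        image_res = requests.get(image["src"])
        with open(os.path.join(dest_folder, "{:03d}.jpg".format(counter)), 'wb') as f:
            f.write(image_res.content)
        progress_bar.increment()

    sys.stdout.write("\rDownloading {}... {} {}% {}".format(label, progress_bar.to_string(), progress_bar.get_percent(), "Done. "))
    sys.stdout.flush()
    print()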


def installReqs():
    if is_pip_installed():
        with open("requirements.txt") as f:
            reqs = [line.strip() for line in f.readlines()]

        for module in reqs:
            print("Installing {}... ".format(module))
            subprocess.call(["pip", "install", module], stdout=subprocess.DEVNULL,
                            stdin=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
        print("Done!\n")
    else:
        print("You need to install pip.")


def is_pip_installed():
    try:
        subprocess.call(["pip", "--version"], stdout=subprocess.DEVNULL,
                        stdin=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        return False
    else:
        return True


# The following code is from Red-DiscordBot
# https://github.com/Cog-Creators/Red-DiscordBot
def is_git_installed():
    try:
        subprocess.call(["git", "--version"], stdout=subprocess.DEVNULL,
                        stdin=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        return False
    else:
        return True


def update():
    try:
        code = subprocess.call(("git", "pull", "--ff-only"))
    except FileNotFoundError:
        print("\nError: Git not found. It's either not installed or you did "
              "not clone this using git. Install instructions are on the GitHub: "
              "https://github.com/ClaytonWWilson/Scraper-for-theTVDB.com")
        return
    if code == 0:
        print("\nUpdating complete.\n")
    else:
        print("\nThere was an error while updating. This may be caused by edits "
              "you have made to the code.")
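

# A minimal usage sketch. The real series objects come from elsewhere in this
# project; the namedtuple below is only a hypothetical stand-in showing the
# attributes download() actually reads (id, name, folder_name, url), and the
# values are placeholders, not a real series.
if __name__ == "__main__":
    from collections import namedtuple

    Series = namedtuple("Series", ["id", "name", "folder_name", "url"])
    example = Series(id=12345, name="Example Show", folder_name="Example Show",
                     url="https://www.thetvdb.com/series/example-show")
    # download(example)  # uncomment to run against a real series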