c/o our Slack channel (check out the site FAQ for more info on that) comes a Python 3 script to collate old GDC Vault entries into a CSV file. This can be handy when you’re trying to remember a talk you saw some years ago whose title or speaker you can’t quite recall…
import argparse
import csv
import os
import re
import urllib.parse
from http import cookiejar

import mechanize
from bs4 import BeautifulSoup
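# Vault entry points, plus local paths for the cached login cookies and the
# per-conference CSV output (a resources folder next to the script, created on
# startup if missing).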
home_page = "https://gdcvault.com/"
login_page = urllib.parse.urljoin(home_page, "login")
resources = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")
cookies_file = os.path.join(resources, "cookies")
conf_list = os.path.join(resources, "conferences")
os.makedirs(conf_list, exist_ok=True)
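# Find the login form on the current page, submit the credentials (prompting
# for any that weren't supplied), and save the session cookies for next time.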
def login(browser, email, password):
for i, form in enumerate(browser.forms()):
        if form.attrs.get("id") != "form_login":
            continue
print("Trying to login...")
if not email:
email = input("email")
if not password:
password = input("password")
browser.select_form(nr=i)
browser.form["email"] = email
browser.form["password"] = password
browser.submit()
browser._ua_handlers["_cookies"].cookiejar.save(
cookies_file, ignore_discard=True, ignore_expires=True
)
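# Placeholder -- not wired up to anything yet.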
def get_session_info(session):
pass
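# Scrape one conference listing: find every session tile, work out its id,
# open its play page, and collect the metadata into a list of dicts
# (plus the conference name).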
def get_content(browser, page):
content = {}
found_conf_name = False
soup = read_page(browser, page)
data = content.setdefault("data", [])
for i, session_item in enumerate(soup.findAll(class_="session_item")):
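        # A session tile either links straight to /play/<id> or carries the id
        # in its onclick handler (cid=<id>), so try both.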
session_href = session_item.get("href")
if session_href:
session_id = str(session_href).split("/")[2]
else:
onClick = session_item.get("onclick")
            if not onClick:
                raise RuntimeError(f"Could not find onClick for {session_item}")
match = re.search("cid=([0-9]+)", onClick)
if not match:
print(f"Failed to parse {onClick}")
continue
session_id = match.groups()[0]
print(f"Fetching {session_id}")
session_link = urllib.parse.urljoin(home_page, f"play/{session_id}")
sd = {}
if not found_conf_name:
content["conference"] = session_item.findChild(
class_="conference_name"
).text.strip()
found_conf_name = True
sd["id"] = session_id
sd["members_only"] = bool(session_item.findChild(class_="members"))
sd["type"] = session_item.findChild(class_="media_type_image")["class"][-1]
page = read_page(browser, session_link)
player = page.find(id="player")
if not player:
print(f"Failed to find player for {session_link}")
continue
tags = []
tags_element = player.findChild(id="tags")
if tags_element:
for tag in tags_element.findChildren("p"):
tags.append(tag.text.strip())
sd["tags"] = tags
iframe = player.findChild("iframe")
if iframe:
sd["source"] = urllib.parse.urljoin(home_page, iframe["src"])
table = player.findChild("table")
for row in table.findAll("tr"):
cells = row.findAll("td")
field = cells[0].text.strip().lower()
value = cells[1].text.strip()
if "session" in field:
field = "name"
elif "company name" in field:
field = "company"
elif "speaker" in field:
field = "speaker"
elif "track" in field:
field = "track"
elif "overview" in field:
field = "overview"
else:
print(f"Unsupported field: {field}")
continue
sd[field] = value
data.append(sd)
return content
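# Drive the whole scrape: set up a mechanize browser with a persistent cookie
# jar, log in if the welcome banner is empty, then write one CSV per
# conference found on the browse page.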
def scrape(email, password):
jar = cookiejar.LWPCookieJar(cookies_file)
if os.path.exists(cookies_file):
jar.load(cookies_file, ignore_discard=True)
browser = mechanize.Browser()
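    # Behave like a regular browser: skip robots.txt checks, send a Firefox
    # user-agent string, and reuse any previously saved cookies.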
browser.set_handle_robots(False)
browser.addheaders = [
(
"User-agent",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
)
]
browser.set_cookiejar(jar)
browser.open(login_page)
soup = read_page(browser)
welcome = soup.find(id="welcome_user_name").text.split(",")[-1].strip()
logged_in = bool(welcome)
if logged_in:
print(f"Logged in as {welcome}.")
else:
print("Attempting to login...")
login(browser, email, password)
browse_root = urllib.parse.urljoin(home_page, "browse")
soup = read_page(browser, browse_root)
csv_columns = [
"id",
"name",
"members_only",
"type",
"source",
"speaker",
"company",
"track",
"overview",
"tags",
]
for conference_div in soup.find_all(class_="show_all"):
link = conference_div.findChild("a")
conf_link = urllib.parse.urljoin(browse_root, link.get("href"))
content = get_content(browser, conf_link)
conference = content.pop("conference")
        conf_file = os.path.join(conf_list, conference + ".csv")
        with open(conf_file, "w", newline="") as fp:
writer = csv.DictWriter(fp, fieldnames=csv_columns)
writer.writeheader()
for data in content["data"]:
writer.writerow(data)
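# Open the given page (or reuse the browser's current response) and hand back
# a BeautifulSoup tree parsed with html5lib.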
def read_page(browser, page=None):
if page:
browser.open(page)
return BeautifulSoup(browser.response().read(), features="html5lib")
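# Command-line entry point: credentials can be passed with -e/-p or typed in
# when the login form is submitted.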
def main():
    parser = argparse.ArgumentParser(description="Scrape GDC Vault session listings into CSV files")
    parser.add_argument("-e", "--email", help="email to use for login")
    parser.add_argument("-p", "--password", help="password to use for login")
args = parser.parse_args()
scrape(email=args.email, password=args.password)
if __name__ == "__main__":
main()
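To run it you’ll want mechanize, beautifulsoup4 and html5lib installed (all pip-installable); pass your Vault credentials with -e and -p, or let the script prompt you, and it will drop one CSV per conference into the resources/conferences folder next to the script.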