Exercise: thread URL requests.
In the following script we fetch the URLs listed in a file:
{% embed include file="src/examples/parallel/urls.txt" %}
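For illustration only (the real list is embedded above), urls.txt is expected to contain one URL per line, for example:

    https://www.python.org/
    https://docs.python.org/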
Fetching takes about 1.5-2 seconds per URL from a home connection, though the exact time depends on many factors, including your network connection.
import sys
import time

import requests
from bs4 import BeautifulSoup


def get_urls(limit):
    # Read the URLs (one per line) and return at most `limit` of them.
    with open('urls.txt') as fh:
        urls = [line.rstrip("\n") for line in fh]
    return urls[:limit]


def get_title(url):
    # Fetch the page and return (title, error); exactly one of them is None.
    try:
        resp = requests.get(url)
        if resp.status_code != 200:
            return None, f"Incorrect status_code {resp.status_code} for {url}"
    except Exception as err:
        return None, f"Error: {err} for {url}"

    soup = BeautifulSoup(resp.content, 'html.parser')
    if soup.title is None:
        return None, f"No title found for {url}"
    return soup.title.string, None


def main():
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} LIMIT")
    limit = int(sys.argv[1])

    urls = get_urls(limit)
    print(urls)

    start = time.time()
    titles = []
    for url in urls:
        title, err = get_title(url)
        if err:
            print(err)
        else:
            print(title)
        titles.append({
            "url": url,
            "title": title,
            "err": err,
        })
    end = time.time()

    print("Elapsed time: {} for {} pages.".format(end - start, len(urls)))
    print(titles)


if __name__ == '__main__':
    main()
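Assuming the script above is saved as fetch_titles.py (a name chosen here for illustration) next to urls.txt, a run limited to the first 3 URLs would look like this:

    python fetch_titles.py 3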
Create a version of the above script that can use K threads.
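One possible solution is sketched below. It uses ThreadPoolExecutor from the standard library's concurrent.futures module and reuses get_urls() and get_title() from the sequential script, which is assumed here to have been saved as fetch_titles.py; the extra THREADS command-line argument supplies K.

    import sys
    import time
    from concurrent.futures import ThreadPoolExecutor

    # Assumes the sequential script above was saved as fetch_titles.py.
    from fetch_titles import get_urls, get_title


    def main():
        if len(sys.argv) < 3:
            sys.exit(f"Usage: {sys.argv[0]} LIMIT THREADS")
        limit = int(sys.argv[1])
        threads = int(sys.argv[2])

        urls = get_urls(limit)
        print(urls)

        start = time.time()
        titles = []
        # Run get_title on the URLs using K worker threads. executor.map()
        # yields the results in the same order as the input URLs.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            for url, (title, err) in zip(urls, executor.map(get_title, urls)):
                print(err if err else title)
                titles.append({
                    "url": url,
                    "title": title,
                    "err": err,
                })
        end = time.time()

        print("Elapsed time: {} for {} pages.".format(end - start, len(urls)))
        print(titles)


    if __name__ == '__main__':
        main()

Because executor.map() preserves input order, the collected titles list matches the sequential version. The work is dominated by waiting on the network, so K threads should cut the elapsed time roughly K-fold, up to the number of URLs.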