Solution: Fetch URLs in parallel

  • First create a function that fetches a single page and use the regular map to call it.
  • Deal with the encoding of the titles.
  • Replace continue with return so that failed fetches appear as None in the results.
  • Starting the pool adds about 2 seconds of overhead, but fetching 20 pages dropped from 18 seconds to 7 seconds with a pool of 5 workers (see the sketch below).
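The roughly 2-second overhead is the cost of starting the worker processes. A minimal, self-contained sketch of the effect; fake_fetch and its 1-second sleep are made-up stand-ins for the real HTTP request, and the exact numbers will vary by machine:

import time
from multiprocessing import Pool


def fake_fetch(n):
    time.sleep(1)  # stand-in for a slow HTTP request
    return n


if __name__ == '__main__':
    start = time.time()
    list(map(fake_fetch, range(20)))                 # one at a time: about 20 seconds
    print(f"sequential: {time.time() - start:.1f}s")

    start = time.time()
    with Pool(5) as pool:
        pool.map(fake_fetch, range(20))              # 5 at a time: about 4 seconds plus startup
    print(f"pool of 5:  {time.time() - start:.1f}s")

The full solution: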
import time
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from multiprocessing import Pool


def get_urls(content):
    urls = []
    root = ET.fromstring(content)
    # Collect the <loc> entries of the sitemap (the tag names carry an XML namespace).
    for child in root:
        for ch in child:
            if ch.tag.endswith('loc'):
                urls.append(ch.text)

    # The full sitemap contains about 2653 URLs; limit the exercise to the first 20.
    MAX = 20
    if len(urls) > MAX:
        urls = urls[:MAX]

    return urls

def get_title(url):
    resp = requests.get(url)
    if resp.status_code != 200:
        print(f"Incorrect status_code {resp.status_code} for {url}")
        return None  # failed fetches show up as None in the results

    soup = BeautifulSoup(resp.content, 'html.parser')
    print(soup.title.string)
    return soup.title.string.encode('utf-8')  # encode to bytes to deal with the encoding of the title


def main():
    start = time.time()
    url = 'https://code-maven.com/slides/sitemap.xml'
    resp = requests.get(url)
    if resp.status_code != 200:
        exit(f"Incorrect status_code {resp.status_code}")

    urls = get_urls(resp.content)

    # Earlier sequential versions, kept for comparison:
    # titles = []
    # for url in urls:
    #     titles.append(get_title(url))
    # titles = list(map(get_title, urls))
    with Pool(5) as pool:
        titles = pool.map(get_title, urls)
    end = time.time()
    print(f"Elapsed time: {end - start} for {len(urls)} pages.")
    print(titles)
    print("DONE")


if __name__ == '__main__':
    main()
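Since failed fetches come back as None, a possible follow-up step (not part of the original exercise) is to drop them before using the titles; a one-line sketch, assuming the titles list returned by pool.map above:

titles = [title for title in titles if title is not None]  # drop the failed fetches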