# Development notes:
# - First create the function and use the regular (serial) map.
# - Deal with encoding.
# - Replace `continue` with `return` so that None placeholders are included in the results.
# - The pool has ~2 sec startup overhead, but 20 items dropped from 18 sec to 7 sec with a pool of 5.
import time
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from multiprocessing import Pool
import os
def get_urls(content, max_urls=20):
    """Extract page URLs from sitemap XML.

    Collects the text of every ``<loc>`` element (matched by tag suffix,
    so XML namespaces don't matter) from each ``<url>`` entry.

    Args:
        content: The sitemap XML as bytes or str.
        max_urls: Cap on how many URLs to return. Defaults to 20,
            matching the original hard-coded limit used to keep
            test runs short (the full sitemap has ~2653 entries).

    Returns:
        A list of URL strings, at most ``max_urls`` long.
    """
    root = ET.fromstring(content)
    urls = [
        ch.text
        for child in root
        for ch in child
        if ch.tag.endswith('loc')
    ]
    # Slicing handles the short case too: urls[:max_urls] == urls
    # when there are fewer than max_urls entries.
    return urls[:max_urls]
def get_title(url):
    """Fetch a page and return its <title> text encoded as UTF-8 bytes.

    Returns None on a non-200 response or when the page has no <title>
    tag, so a Pool.map result list keeps one slot per input URL.

    Args:
        url: The page URL to fetch.

    Returns:
        The title as UTF-8 bytes, or None on failure.
    """
    resp = requests.get(url)
    if resp.status_code != 200:
        print(f"Incorrect status_code {resp.status_code} for {url}")
        return None
    soup = BeautifulSoup(resp.content, 'html.parser')
    # Guard: soup.title is None for pages without a <title> tag, and
    # .string can be None for an empty/compound title; the original
    # would raise AttributeError on such pages.
    if soup.title is None or soup.title.string is None:
        print(f"No title found for {url}")
        return None
    print(soup.title.string)
    return soup.title.string.encode('utf-8')
def main():
    """Fetch the sitemap, then fetch each page's title in parallel.

    Uses a pool of 5 worker processes; per the notes at the top of the
    file this cut 20 pages from ~18 sec (serial) to ~7 sec, at the cost
    of ~2 sec of pool startup overhead.
    """
    start = time.time()
    url = 'https://code-maven.com/slides/sitemap.xml'
    resp = requests.get(url)
    if resp.status_code != 200:
        exit(f"Incorrect status_code {resp.status_code}")
    urls = get_urls(resp.content)
    # pool.map already returns a list in input order (failed fetches
    # appear as None), so no manual copy loop is needed.
    with Pool(5) as pool:
        titles = pool.map(get_title, urls)
    end = time.time()
    print("Elapsed time: {} for {} pages.".format(end - start, len(urls)))
    print(titles)
    print("DONE")
if __name__ == '__main__':
    main()