Python 2.7 - Beautiful Soup - Unable to scrape links from paginated pages
I'm unable to scrape the links of the articles present in the paginated webpages. Additionally, I get a blank screen as output at times. I'm unable to find the problem in my loop. Also, the CSV file doesn't get created.
# Code as posted in the question. Only the formatting and the keywords that
# were lost (from / for / with / as, identifier capitalisation) are restored;
# the asker's logic is intentionally left as-is, with review notes on the
# lines that cause the reported symptoms.
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import urllib2


def get_url_for_search_key(search_key):
    """Return the href of every <a> tag found on a search page for *search_key*.

    Intended to walk result pages 1..99 of thedrum.com, but see the review
    notes below: as written it never gets past the first request.
    """
    for i in range(1, 100):
        base_url = 'http://www.thedrum.com/'
        # NOTE(review): `% i` is applied to the value returned by
        # requests.get(...), not to the URL string, so the page number is
        # never substituted into 'page=%s' and the expression raises.
        response = requests.get(base_url + 'search?page=%s&query=' + search_key + '&sorted=') % i
        soup = BeautifulSoup(response.content, "lxml")
        results = soup.findAll('a')  # NOTE(review): assigned but never used
        # NOTE(review): the original indentation is lost; wherever this
        # return sits, the function hands back the links of one page only.
        return [url['href'] for url in soup.findAll('a')]


pprint(get_url_for_search_key('artificial intelligence'))

# NOTE(review): this calls the scraper a second time; if the call above
# fails, this block is never reached and no CSV file is created.
with open('storeurl.csv', 'w+') as f:
    f.seek(0)
    f.write('\n'.join(get_url_for_search_key('artificial intelligence')))
Are you sure that you need only the first 100 pages? Maybe there are more of them...
My vision of the task is below: it collects the links from all pages and precisely catches the "next page" button links:
import re

import requests
from bs4 import BeautifulSoup

# Collect every link on each search-results page, following the
# "next page" button until there is none left.
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []  # one list of hrefs per results page

# Match the button label case-insensitively so that both "Next page" and
# "next page" are recognised.
next_label = re.compile(r'next page', re.IGNORECASE)

while True:
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on a['href'].
    res.append([a['href'] for a in soup.find_all('a', href=True)])
    next_button = soup.find('a', href=True, text=next_label)
    if not next_button:
        break
    # Resolve the link against the current page URL in case it is relative;
    # an absolute href is returned unchanged by urljoin.
    next_url = requests.compat.urljoin(response.url, next_button['href'])
    response = requests.get(next_url)
    soup = BeautifulSoup(response.content, "lxml")

# Edit: an alternative approach that collects only the article links:
import re

import requests
from bs4 import BeautifulSoup

# Collect only the article links: the search is limited to the
# 'search-results' container of each page, then the "next page" button
# is followed until there is none left.
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []  # one list of article hrefs per results page

# Match the button label case-insensitively so that both "Next page" and
# "next page" are recognised.
next_label = re.compile(r'next page', re.IGNORECASE)

while True:
    # Localize the search window that holds the article links.
    search_results = soup.find('div', class_='search-results')
    if search_results is None:
        # No results container on this page: stop instead of failing with
        # AttributeError on None.
        break
    # The ordinary scheme goes further from here; href=True skips anchors
    # without an href attribute.
    article_link_tags = search_results.find_all('a', href=True)
    res.append([a['href'] for a in article_link_tags])
    next_button = soup.find('a', href=True, text=next_label)
    if not next_button:
        break
    # Resolve the link against the current page URL in case it is relative;
    # an absolute href is returned unchanged by urljoin.
    next_url = requests.compat.urljoin(response.url, next_button['href'])
    response = requests.get(next_url)
    soup = BeautifulSoup(response.content, "lxml")

# To print the links use:
# `res` holds one list of links per page, so flatten it while printing.
for i in res:
    for j in i:
        print(j)
Comments
Post a Comment