"""Scrape the Douban Top 250 movie list and save the titles to movies.txt.

@author : Eric-chen
@contact: 809512722@qq.com
@time   : 2018/1/3 17:55
@desc   : Crawl http://movie.douban.com/top250/ to collect the Douban Top 250
          movies and write them to the file movies.txt.
"""
import codecs

import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# NOTE(review): the site reportedly rejects the default python-requests
# User-Agent; a browser-like value is sent to be safe -- confirm if needed.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0 Safari/537.36',
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def download_page(url, timeout=REQUEST_TIMEOUT):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before aborting.

    Returns:
        The undecoded response content.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content


def parse_html(html):
    """Extract movie titles and the next-page URL from one listing page.

    Args:
        html: Markup of a Top 250 listing page (bytes or text).

    Returns:
        A tuple ``(movie_names, next_url)`` where ``movie_names`` is the list
        of titles found on the page and ``next_url`` is the absolute URL of
        the following page, or ``None`` when there is no further page.

    Raises:
        ValueError: If the page contains no ``<ol class="grid_view">``
            element, i.e. it is not a listing page.
    """
    soup = BeautifulSoup(html, "lxml")

    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        raise ValueError(
            "movie list (<ol class='grid_view'>) not found in page")

    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        if detail is None:
            # Not a movie entry (no header block); nothing to extract.
            continue
        title = detail.find('span', attrs={'class': 'title'})
        if title is None:
            continue
        movie_name_list.append(title.getText())

    # The pager span may lack an <a> (no further page) or be absent entirely.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None


def main():
    """Walk every listing page and write one movie title per line to movies.txt."""
    url = DOWNLOAD_URL

    with codecs.open('movies.txt', 'wb', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            if movies:
                # Skip empty pages so no stray blank line is written.
                fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))


if __name__ == '__main__':
    main()