爬取 Stack Overflow

使用 urllib2 和 BeautifulSoup 爬取 Stack Overflow 问题页面的内容。

#!/usr/bin/env python 
# -*- coding:utf-8 -*-


import cookielib
import urllib2
import re
import sys
from bs4 import BeautifulSoup


def open_url_get_html(url):
    """Fetch ``url`` and return the response body.

    Args:
        url: Address passed unchanged to ``urllib2.urlopen``.

    Returns:
        The data read from the response when its status code is 200,
        otherwise an empty string.

    Raises:
        Nothing is caught here, so any exception raised by
        ``urllib2.urlopen`` or by reading the response propagates to the
        caller.
    """
    response = urllib2.urlopen(url)
    try:
        if response.getcode() != 200:
            return ''
        return response.read()
    finally:
        # Release the connection on every path, including when read() raises.
        response.close()

if __name__ == "__main__":
    # Script entry point: download one page, dump it, then pull a few
    # elements out of it with BeautifulSoup.

    # URL of the page to fetch.
    url = 'https://stackoverflow.com/questions/50000000'
    # Download the page; an empty string means no HTML was returned.
    html_doc = open_url_get_html(url)
    if not html_doc:
        print("未获得返回html,退出程序")
        # Exit with a non-zero status so a calling shell can tell the
        # fetch failed (status 0 would signal success).
        sys.exit(1)
    print(html_doc)

    # Build a BeautifulSoup parse tree from the downloaded document.
    soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")
    # Collect every <a> element in the document.
    links = soup.find_all('a')

    # Select elements by CSS class.
    # NOTE(review): presumably the question title link and the post bodies;
    # verify the class names against the current page markup.
    question = soup.find_all("a", class_="question-hyperlink")
    question_content = soup.find_all("div", class_="post-text")

    print(question)

  
    展开阅读全文