Py爬网页上的代理IP地址

使用Python抓取网页上的IP地址。

第一步找一个有IP的网页
第二步抓取页面
第三步提取IP

HTTP GET请求代码

import httplib
import re
import StringIO
import gzip
import json

def http_get(host, url, body, header):
    """Send an HTTP GET request and return the response body.

    Args:
        host: Host name (optionally ``host:port``) to connect to.
        url: Path placed on the request line.
        body: Request body handed to ``conn.request``; may be None.
        header: Dict of request headers.

    Returns:
        The response body, gunzipped when the server answers with
        ``Content-Encoding: gzip``. Returns None if any step fails
        (bad host, connection error, timeout, corrupt gzip data, ...).
    """
    conn = None
    try:
        # Pass the 20 second timeout by keyword so it cannot be mistaken for
        # another positional parameter of HTTPConnection.
        conn = httplib.HTTPConnection(host, timeout=20)
        conn.request("GET", url, body, header)
        res = conn.getresponse()
        res_str = res.read()
        encoding = res.getheader('Content-Encoding')
        # Header values are case-insensitive tokens, so normalise before
        # comparing.
        if encoding and encoding.lower() == 'gzip':
            stream = StringIO.StringIO(res_str)
            gzipper = gzip.GzipFile(fileobj=stream)
            res_str = gzipper.read()
        return res_str
    except Exception:
        # Deliberate best-effort: callers only distinguish "got a body" from
        # "got nothing", so every failure is reported as None.
        return None
    finally:
        # Always release the socket, including when request/getresponse/read
        # raised; conn is still None if the constructor itself failed.
        if conn is not None:
            conn.close()

提取网页上的IP（正则写得不好），返回IP列表

# Request headers passed to http_get() when querying www.66ip.cn.
# The User-Agent / x-flash-version / Referer values identify the client as an
# Adobe AIR application; the Cookie value is a placeholder.
MainHeader = {
    'Host': 'www.66ip.cn',
    'Cookie': '111111111111111111111',
    # Accepted media types, joined into a single comma-separated header value.
    'Accept': ', '.join((
        'text/xml',
        'application/xml',
        'application/xhtml+xml',
        'text/html;q=0.9',
        'text/plain;q=0.8',
        'text/css',
        'image/png',
        'image/jpeg',
        'image/gif;q=0.8',
        'application/x-shockwave-flash',
        'video/mp4;q=0.9',
        'flv-application/octet-stream;q=0.8',
        'video/x-flv;q=0.7',
        'audio/mp4',
        'application/futuresplash',
        '*/*;q=0.5',
    )),
    'User-Agent': 'Mozilla/5.0 (Android; U; zh-CN) AppleWebKit/533.19.4 (KHTML, like Gecko) AdobeAIR/23.0',
    'x-flash-version': '23,0,0,162',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'Referer': 'app:/assets/CardMain.swf',
    'Content-Type': 'application/x-www-form-urlencoded',
}
# One IPv4 octet (0-255, no leading zeros). Written as a non-capturing group
# so that findall() yields whole "ip:port" strings rather than tuples of
# sub-groups.
_OCTET = r'(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)'

# "a.b.c.d:port". The look-behind stops a match from starting in the middle of
# a longer run of digits/dots (e.g. inside "1234.5.6.7:80").
_PROXY_RE = re.compile(
    r'(?<![\d.])' + _OCTET + r'(?:\.' + _OCTET + r'){3}:[1-9]\d*'
)


def get_ip_list():
    """Fetch the proxy page from www.66ip.cn and extract its proxies.

    Returns:
        A list of ``"ip:port"`` strings found in the response, in page order.
        The list is empty when the request fails (http_get returns None), the
        response is empty, or no well-formed address is present.
    """
    host = 'www.66ip.cn'
    url = '/nmtq.php'
    # NOTE(review): these form-style parameters are sent as the body of a GET
    # request; the server may expect them in the query string instead --
    # confirm against the service before changing.
    body = 'getnum=10&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip'
    ret = http_get(host, url, body, MainHeader)
    if not ret:
        return []
    return _PROXY_RE.findall(ret)