使用Python抓取网页上的IP地址。

第一步找一个有IP的网页
第二步抓取页面
第三步提取IP

HTTP GET请求代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import httplib
import re
import StringIO
import gzip
import json

def http_get(host, url, body, header):
    """Send an HTTP GET request and return the response body.

    Args:
        host: Host to connect to over plain HTTP.
        url: Request path handed to ``HTTPConnection.request``.
        body: Request body sent along with the GET request.
        header: Dict of request headers.

    Returns:
        The response body, decompressed when the response carries a gzip
        ``Content-Encoding`` header, or ``None`` if the request or the
        decompression fails for any reason.
    """
    conn = None
    try:
        # Pass the timeout by keyword so it cannot be mistaken for another
        # positional constructor argument.
        conn = httplib.HTTPConnection(host, timeout=20)
        conn.request("GET", url, body, header)
        res = conn.getresponse()
        res_str = res.read()
        encoding = res.getheader('Content-Encoding')
        # Content-coding names are case-insensitive, so normalise before
        # comparing; the header may also be absent (None).
        if encoding and encoding.strip().lower() == 'gzip':
            stream = StringIO.StringIO(res_str)
            gzipper = gzip.GzipFile(fileobj=stream)
            res_str = gzipper.read()
        return res_str
    except Exception:
        # Deliberate best-effort: callers treat None as "no data".
        return None
    finally:
        # Release the socket on the failure paths as well.
        if conn is not None:
            conn.close()

提取网页上的IP（正则写得不好），返回IP列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Request headers sent with every call to www.66ip.cn.
MainHeader = {
    'Host': 'www.66ip.cn',
    'Cookie': '111111111111111111111',
    # Adjacent literals are concatenated into one Accept value.
    'Accept': (
        'text/xml, application/xml, application/xhtml+xml, text/html;q=0.9, text/plain;q=0.8, text/css, '
        'image/png, image/jpeg, image/gif;q=0.8, application/x-shockwave-flash, video/mp4;q=0.9, '
        'flv-application/octet-stream;q=0.8, video/x-flv;q=0.7, audio/mp4, application/futuresplash, '
        '*/*;q=0.5'
    ),
    'User-Agent': 'Mozilla/5.0 (Android; U; zh-CN) AppleWebKit/533.19.4 (KHTML, like Gecko) AdobeAIR/23.0',
    'x-flash-version': '23,0,0,162',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'Referer': 'app:/assets/CardMain.swf',
    'Content-Type': 'application/x-www-form-urlencoded',
}
# Matches "a.b.c.d:port" with each octet in 0-255 and a port that does not
# start with 0. Only non-capturing groups are used so that findall() yields
# the whole "ip:port" string; the lookbehind stops a match from starting in
# the middle of a longer digit/dot run (e.g. inside "256.1.1.1:80").
_IP_PORT_RE = re.compile(
    r'(?<![\d.])'
    r'(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}'
    r'(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)'
    r':[1-9]\d*'
)


def get_ip_list():
    """Fetch the proxy page from www.66ip.cn and extract its addresses.

    Returns:
        A list of ``"ip:port"`` strings found in the response body, or an
        empty list when the request fails or returns no content.
    """
    host = 'www.66ip.cn'
    url = '/nmtq.php'
    body = 'getnum=10&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip'
    ret = http_get(host, url, body, MainHeader)
    if not ret:
        # http_get returns None on failure; an empty body has nothing to scan.
        return []
    return _IP_PORT_RE.findall(ret)