Python3************
url
HTMLJson
get
Python HTTP libraries: urllib, urllib2, requests, httplib2
Requests
import requests
# Fetch the page once and reuse the same response for both headers and body;
# the original issued a second, redundant GET just to read the content.
response = requests.get(url)
content = response.content
# "%s" formatting with a single parenthesised argument works both as a
# Python 2 print statement and as a Python 3 print() call.
print("response headers: %s" % (response.headers,))
print("content: %s" % (content,))
Urllib2
import urllib2
# Open the url once and read the body from that same response object; the
# original opened the url a second time just to read the content.
response = urllib2.urlopen(url)
content = response.read()
# "%s" formatting with a single parenthesised argument prints the same text
# as the original comma-separated print statement.
print("response headers: %s" % (response.headers,))
print("content: %s" % (content,))
Httplib2
import httplib2
# Create an httplib2 client and issue a GET; the result of request() is
# unpacked into the response headers and the response body.
http = httplib2.Http()
response_headers, content = http.request(url, 'GET')
print "response headers:", response_headers
print "content:", content
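For a url with query fields, a GET request appends the data to the url: `?` separates the url from the data and `&` joins multiple parameters.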
# Placeholder query parameters (two string fields) for the GET examples.
data = {'data1':'XXXXX', 'data2':'XXXXX'}
Requests: data is a dict or json
import requests
# Pass the dict as params so requests appends it to the url as a query string.
response = requests.get(url=url, params=data)
Urllib2: data is a string
import urllib, urllib2
# Encode the dict into a query string (rebinding data from dict to str).
data = urllib.urlencode(data)
# Join the url and the encoded query with '?' and fetch the combined url.
full_url = url+'?'+data
response = urllib2.urlopen(full_url)
**2.1 **
postcookie
# Placeholder form fields (two string fields) for the POST examples.
data = {'data1':'XXXXX', 'data2':'XXXXX'}
Requests: data is a dict or json
import requests
# POST the dict to url; it is handed to requests through the data argument.
response = requests.post(url=url, data=data)
Urllib2: data is a string
import urllib, urllib2
# Encode the dict into a string (rebinding data from dict to str).
data = urllib.urlencode(data)
# Attach the encoded string to a Request; urllib2 sends a POST when a
# Request carries data.
req = urllib2.Request(url=url, data=data)
response = urllib2.urlopen(req)
2.2 cookie
cookiecookie
import requests
# Create one session object; every requests_session.* call below goes through it.
requests_session = requests.session()
# Submit the login form through the session.
response = requests_session.post(url=url_login, data=data)
# NOTE(review): identical to the POST above; possibly a line that belonged to
# the surrounding (stripped) prose rather than to the example. Confirm.
response = requests_session.post(url=url_login, data=data)
# NOTE(review): `cookies` is not defined in the visible snippet; confirm its source.
response_captcha = requests_session.get(url=url_login, cookies=cookies)
response1 = requests.get(url_login) # plain requests.get, not through the session
response2 = requests_session.get(url_login) # through the session (original note: "Response Cookie")
response3 = requests_session.get(url_results) # through the session (original note: "Response Cookie")
**3.1 **
IP
IPIP
# Proxy mapping: scheme -> proxy url (placeholder address and port).
proxies = {'http':'http://XX.XX.XX.XX:XXXX'}
Requests
import requests
# Issue the GET with the proxy mapping passed through the proxies argument.
response = requests.get(url=url, proxies=proxies)
Urllib2
import urllib2
# Build an opener from a ProxyHandler configured with the proxy mapping.
proxy_support = urllib2.ProxyHandler(proxies)
opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
urllib2.install_opener(opener) # install the opener so that urlopen() uses it
response = urllib2.urlopen(url)
**3.2 **
Requests, Urllib2: use the sleep() function of the time module
import time
# Pause execution for one second.
time.sleep(1)
**3.3 **
User-AgentRefererRefererReferer
# Three alternative header sets with placeholder values. Each assignment
# rebinds headers, so if all three lines run only the last one is kept.
headers = {'User-Agent':'XXXXX'} # User-Agent only
headers = {'Referer':'XXXXX'} # Referer only
headers = {'User-Agent':'XXXXX', 'Referer':'XXXXX'} # both headers
Requests
# Issue the GET with the custom headers dict passed through the headers argument.
response = requests.get(url=url, headers=headers)
Urllib2
import urllib, urllib2
# Attach the custom headers to a Request object, then open that request.
req = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(req)
def multi_session(session, *arg, **kwargs):
    """POST through ``session``, retrying up to 20 times on failure.

    Args:
        session: Object exposing ``post(*args, **kwargs)``, for example a
            ``requests`` session.
        *arg: Positional arguments forwarded unchanged to ``session.post``.
        **kwargs: Keyword arguments forwarded unchanged to ``session.post``.

    Returns:
        The value returned by the first ``session.post`` call that does not
        raise, or ``None`` if all 20 attempts raised.
    """
    import sys  # local import so the function does not rely on file-level imports

    retry_times = 20
    while retry_times > 0:
        try:
            return session.post(*arg, **kwargs)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still abort the retry loop.
            sys.stdout.write('. ')  # progress marker, one per failed attempt
            retry_times -= 1
    # Every attempt failed: keep the original best-effort contract.
    return None
def multi_open(opener, *arg, **kwargs):
    """Open through ``opener``, retrying up to 20 times on failure.

    Args:
        opener: Object exposing ``open(*args, **kwargs)``, for example an
            opener built with ``urllib2.build_opener``.
        *arg: Positional arguments forwarded unchanged to ``opener.open``.
        **kwargs: Keyword arguments forwarded unchanged to ``opener.open``.

    Returns:
        The value returned by the first ``opener.open`` call that does not
        raise, or ``None`` if all 20 attempts raised.
    """
    import sys  # local import so the function does not rely on file-level imports

    retry_times = 20
    while retry_times > 0:
        try:
            return opener.open(*arg, **kwargs)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still abort the retry loop.
            sys.stdout.write('. ')  # progress marker, one per failed attempt
            retry_times -= 1
    # Every attempt failed: keep the original best-effort contract.
    return None
multi_session and multi_open retry requests made through an existing session or opener.
Ajax
urlJavaScripturl
Google Chrome(NetworkGETTypetext/htmlgetRequest URL)
SeleniumSelenium
cookie
Tesseract-OCR
ScrapyTwistedPython
RobotsRobotsRobots Exclusion ProtocolRobots
robots.txt https://www.taobao.com/robots.txt robots.txt
User-agent:
Disallow:
Allow:
: "/"
User-agent: *
Disallow: /
User-agent: *
Disallow:
User-agent: BadBot
Disallow: /
User-agent: GoodBot
Disallow:
User-agent: *
Disallow: /images/
User-agent: *
Allow: /images/
Disallow: /
User-agent: *
Disallow: /*.html$
User-agent: *
Allow: /*.html$
Disallow: /