proxyPass = "0B43E0B5352C5033"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host" : proxyHost,
"port" : proxyPort,
"user" : proxyUser,
"pass" : proxyPass,
}
proxies = {
"http" : proxyMeta,
"https" : proxyMeta,
}
html = requests.get(url,headers=headers,timeout=30)
code = html.encoding
return html.content
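# For reference: this excerpt assumes the imports and module-level settings
# below, which the full script defines further up. The values shown here are
# illustrative placeholders, not the author's actual configuration.
import requests
import time
import json
import re
import multiprocessing
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}             # assumed request headers
proxyHost, proxyPort = "proxy.example.com", "8080"  # assumed proxy endpoint
proxyUser = "username"                              # assumed proxy account
wordfile = 'words.txt'                              # assumed keyword list, one word per line
client = 'pc'                                       # assumed mode: pc / wap / xgss / shoulu
outfile = open('result.csv', 'w')                   # assumed output file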
def date(timeStamp):
    # Convert a Unix timestamp to a local 'YYYY-MM-DD HH:MM:SS' string
    # (e.g. with TZ=Asia/Shanghai, date(1461542400) -> '2016-04-25 08:00:00').
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime
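# The wap branch below calls a search() helper that is not part of this
# excerpt. A minimal sketch, assuming it returns the first capture group on a
# match and the sentinel string 'no' otherwise, which is what the fallback
# chain below tests against:
def search(pattern, text):
    m = re.search(pattern, text)
    if m:
        return m.group(1)
    return 'no'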
def getContent(word, client):
    if client == 'pc':
        pcurl = 'http://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
        html = getHTml(pcurl, client)
        # tn=json makes Baidu return results as {'feed': {'entry': [...]}}.
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            if 'title' in tag:
                title = tag['title']
                url = tag['url']
                rank = tag['pn']
                pub_time = date(tag['time'])  # publish time of the result
                outfile.write('%s,%s,%s,%s,%s\n' % (word, rank, url, title, pub_time))
                print rank, url
        return 1
    elif client == 'wap':
        wapurl = 'http://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
        html = getHTml(wapurl, client)
        tree = etree.HTML(html)
        div = tree.xpath('//*[@id="results"]/div')
        for line in div:
            line_html = etree.tostring(line)
            print line_html
            # Capture the c-title heading, then strip any inner tags from it.
            title = re.sub(r'<[^>]*?>', '', search(r'class="c-title[^>]*?>([\s\S]*?)</h3>', line_html))
            rank = search(r'order="(\d+)"', line_html)
            # Fall back through a chain of patterns for the displayed domain.
            # NOTE: the HTML-tag portions of these patterns are incomplete here;
            # fill in the markup of the current wap SERP before use.
            domain = search(r'[^>]*?>(.*?)', line_html)
            if domain == 'no':
                domain = search(r'(.*?)\s+\d+k', line_html)
            if domain == 'no':
                domain = search(r'(.*?)', line_html)
            if domain == 'no':
                domain = search(r'(.*?)', line_html)
            if domain == 'no':
                domain = search('(.*?)', line_html)
            if domain == 'no':
                domain = search(r'(.*?) \d+k', line_html)
            if domain == 'no':
                domain = '搜索特型'  # "special result type": no organic domain to extract
            print rank, domain
            outfile.write('%s,%s,%s\n' % (word, rank, domain))
        return 1
    elif client == 'xgss':
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
        url = 'http://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        try:
            html = getHTml(url, client)
            # The baidurs2top response is parsed as a comma-separated list
            # of related searches.
            for i in html.split(','):
                print i
                outfile.write('"%s","%s"\n' % (word, i))
        except Exception:
            print 'Error'
elif client == "shoulu":
print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
html = getHTml(url,client)
html_dict = json.loads(html)
if html_dict['feed']['entry'] == [{}]:
include = "未收录"
else:
line = html_dict['feed']['entry'][0]
link = line["url"]
date = line["time"]
include = date
print url,include
outfile.write("%s,%s\n" % (url,include))
else:
return 'Error'
words = open(wordfile).readlines()
pool = multiprocessing.Pool(processes=3)
for word in words:
    word = word.strip()
    pool.apply_async(getContent, (word, client))
pool.close()
pool.join()
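# Note: apply_async() silently discards exceptions raised inside worker
# processes. A sketch of the same driver that keeps the AsyncResult handles
# and calls get(), so crawl errors surface in the parent process:
# results = [pool.apply_async(getContent, (w.strip(), client)) for w in words]
# pool.close()
# pool.join()
# for r in results:
#     r.get()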