proxyPass
=
"0B43E0B5352C5033"
proxyMeta
=
"http://%(user)s:%(pass)s@%(host)s:%(port)s"
%
{
"host"
: proxyHost,
"port"
: proxyPort,
"user"
: proxyUser,
"pass"
: proxyPass,
}
proxies
=
{
"http"
: proxyMeta,
"https"
: proxyMeta,
}
html
=
requests.get(url,headers
=
headers,timeout
=
30
)
code
=
html.encoding
return
html.content
def date(timeStamp):
    """Format a Unix timestamp as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    broken_down = time.localtime(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", broken_down)
def getContent(word, client):
    # Crawl Baidu search results for `word` in one of four modes selected by
    # `client`, appending CSV rows to the module-level `outfile`.
    # Depends on module-level names defined elsewhere in this file:
    # getHTml, search, outfile, json, etree, re, date.
    # Returns 1 for 'pc'/'wap', None (falls off the end) for 'xgss'/'shoulu',
    # and the string 'Error' for any other client value.
    #
    # NOTE(review): this body was reconstructed from a whitespace-mangled dump;
    # several regex string literals below were garbled when the source was
    # scraped (HTML tag text stripped out of them) — verify every pattern
    # against the upstream original before running.
    if client == 'pc':
        # Baidu PC JSON endpoint (tn=json returns results as JSON).
        pcurl = 'http://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
        html = getHTml(pcurl, client)
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            # Python 2 dict API; entries without a 'title' key are skipped.
            if tag.has_key('title'):
                title = tag['title']
                url = tag['url']
                rank = tag['pn']  # 'pn' looks like the result's position number
                # NOTE(review): this local `time` shadows the `time` module
                # name within this function after first assignment.
                time = date(tag['time'])
                outfile.write('%s,%s,%s,%s,%s\n' % (word, rank, url, title, time))
                print rank, url
        return 1
    elif client == 'wap':
        # Mobile (WAP) result page, parsed as HTML with lxml.
        wapurl = 'http://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
        html = getHTml(wapurl, client)
        tree = etree.HTML(html)
        div = tree.xpath('//*[@id="results"]/div')
        for line in div:
            line_html = etree.tostring(line)
            print line_html
            # Extract the result title, then strip any remaining HTML tags.
            # NOTE(review): pattern reconstructed from a garbled literal —
            # confirm against the original source.
            title = re.sub('<[^>]*?>', ' ', search(r'class="c-title[^>]*?>([\s\S]*?)</h3>', line_html))
            rank = search(r'order="(\d+)"', line_html)
            # Fallback chain for the result's domain. The `search` helper
            # presumably returns the string 'no' when the pattern misses —
            # TODO confirm against its definition elsewhere in this file.
            # NOTE(review): the HTML tag text inside these patterns was lost
            # when the source was scraped; they are almost certainly broken
            # as written.
            domain = search(r' ]*?>(.*?)', line_html)
            if domain == 'no':
                domain = search(r' (.*?)\s+\d+k ', line_html)
            if domain == 'no':
                domain = search(r'(.*?)', line_html)
            if domain == 'no':
                domain = search(r' (.*?) ', line_html)
            if domain == 'no':
                domain = search('(.*?)', line_html)
            if domain == 'no':
                # NOTE(review): the string literal on the next line is
                # unterminated in the original (closing quote missing before
                # `,line_html`) — this is a SyntaxError and must be repaired
                # against the upstream source.
                domain = search(r' (.*?) \d+k,line_html)
            if domain == 'no':
                domain = '搜索特型'  # special/vertical result block with no plain domain link
            print rank, domain
            outfile.write('%s,%s,%s\n' % (word, rank, domain))
        return 1
    elif client == 'xgss':
        # Related-search suggestions endpoint; response is comma-separated terms.
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
        url = 'http://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        try:
            html = getHTml(url, client)
            for i in html.split(','):
                print i
                outfile.write('"%s","%s"\n' % (word, i))
        except:
            # NOTE(review): bare except silently swallows every error,
            # including KeyboardInterrupt — consider narrowing.
            print 'Error'
    elif client == "shoulu":
        # Inclusion check: rn=1&tn=json asks for a single JSON result to test
        # whether the word/URL is indexed by Baidu.
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
        url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
        html = getHTml(url, client)
        html_dict = json.loads(html)
        if html_dict['feed']['entry'] == [{}]:
            include = "未收录"  # "not indexed"
        else:
            line = html_dict['feed']['entry'][0]
            link = line["url"]
            # NOTE(review): this local `date` shadows the module-level date()
            # helper within this branch.
            date = line["time"]
            include = date
        print url, include
        outfile.write("%s,%s\n" % (url, include))
    else:
        return 'Error'
# Driver: read keywords (one per line) from `wordfile` and crawl each one in a
# pool of 3 worker processes.  `wordfile` and `client` are module-level names
# defined earlier in the file.
# Fix: the original called open(wordfile).readlines() without ever closing the
# handle; a `with` block closes the file deterministically.
with open(wordfile) as word_fh:
    words = word_fh.readlines()
pool = multiprocessing.Pool(processes=3)
for word in words:
    word = word.strip()  # drop the trailing newline (and surrounding spaces)
    pool.apply_async(getContent, (word, client))
pool.close()  # no more tasks will be submitted
pool.join()   # wait for all workers to finish
# NOTE(review): removed stray scraped-page text "我要评论" ("post a comment") — webpage residue, not Python code.