proxyPass
=
"0B43E0B5352C5033"
proxyMeta
=
"http://%(user)s:%(pass)s@%(host)s:%(port)s"
%
{
"host"
: proxyHost,
"port"
: proxyPort,
"user"
: proxyUser,
"pass"
: proxyPass,
}
proxies
=
{
"http"
: proxyMeta,
"https"
: proxyMeta,
}
html
=
requests.get(url,headers
=
headers,timeout
=
30
)
code
=
html.encoding
return
html.content
def date(timeStamp):
    """Format a Unix timestamp as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    broken_down = time.localtime(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", broken_down)
def getContent(word, client):
    # Crawl Baidu search results for `word` in one of four modes selected by
    # `client`, appending CSV rows to the module-level `outfile`.
    # Depends on module-level names defined elsewhere in this file:
    # getHTml, search, outfile, json, etree, re, date.
    # Returns 1 for 'pc'/'wap', None (falls off the end) for 'xgss'/'shoulu',
    # and the string 'Error' for any other client value.
    #
    # NOTE(review): this body was reconstructed from a whitespace-mangled dump;
    # several regex string literals below were garbled when the source was
    # scraped (HTML tag text stripped out of them) — verify every pattern
    # against the upstream original before running.
    if client == 'pc':
        # Baidu PC JSON endpoint (tn=json returns results as JSON).
        pcurl = 'http://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
        html = getHTml(pcurl, client)
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            # Python 2 dict API; entries without a 'title' key are skipped.
            if tag.has_key('title'):
                title = tag['title']
                url = tag['url']
                rank = tag['pn']  # 'pn' looks like the result's position number
                # NOTE(review): this local `time` shadows the `time` module
                # name within this function after first assignment.
                time = date(tag['time'])
                outfile.write('%s,%s,%s,%s,%s\n' % (word, rank, url, title, time))
                print rank, url
        return 1
    elif client == 'wap':
        # Mobile (WAP) result page, parsed as HTML with lxml.
        wapurl = 'http://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
        html = getHTml(wapurl, client)
        tree = etree.HTML(html)
        div = tree.xpath('//*[@id="results"]/div')
        for line in div:
            line_html = etree.tostring(line)
            print line_html
            # Extract the result title, then strip any remaining HTML tags.
            # NOTE(review): pattern reconstructed from a garbled literal —
            # confirm against the original source.
            title = re.sub('<[^>]*?>', ' ', search(r'class="c-title[^>]*?>([\s\S]*?)</h3>', line_html))
            rank = search(r'order="(\d+)"', line_html)
            # Fallback chain for the result's domain. The `search` helper
            # presumably returns the string 'no' when the pattern misses —
            # TODO confirm against its definition elsewhere in this file.
            # NOTE(review): the HTML tag text inside these patterns was lost
            # when the source was scraped; they are almost certainly broken
            # as written.
            domain = search(r' ]*?>(.*?)', line_html)
            if domain == 'no':
                domain = search(r' (.*?)\s+\d+k ', line_html)
            if domain == 'no':
                domain = search(r'(.*?)', line_html)
            if domain == 'no':
                domain = search(r' (.*?) ', line_html)
            if domain == 'no':
                domain = search('(.*?)', line_html)
            if domain == 'no':
                # NOTE(review): the string literal on the next line is
                # unterminated in the original (closing quote missing before
                # `,line_html`) — this is a SyntaxError and must be repaired
                # against the upstream source.
                domain = search(r' (.*?) \d+k,line_html)
            if domain == 'no':
                domain = '搜索特型'  # special/vertical result block with no plain domain link
            print rank, domain
            outfile.write('%s,%s,%s\n' % (word, rank, domain))
        return 1
    elif client == 'xgss':
        # Related-search suggestions endpoint; response is comma-separated terms.
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
        url = 'http://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        try:
            html = getHTml(url, client)
            for i in html.split(','):
                print i
                outfile.write('"%s","%s"\n' % (word, i))
        except:
            # NOTE(review): bare except silently swallows every error,
            # including KeyboardInterrupt — consider narrowing.
            print 'Error'
    elif client == "shoulu":
        # Inclusion check: rn=1&tn=json asks for a single JSON result to test
        # whether the word/URL is indexed by Baidu.
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
        url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
        html = getHTml(url, client)
        html_dict = json.loads(html)
        if html_dict['feed']['entry'] == [{}]:
            include = "未收录"  # "not indexed"
        else:
            line = html_dict['feed']['entry'][0]
            link = line["url"]
            # NOTE(review): this local `date` shadows the module-level date()
            # helper within this branch.
            date = line["time"]
            include = date
        print url, include
        outfile.write("%s,%s\n" % (url, include))
    else:
        return 'Error'
# Driver: read keywords (one per line) from `wordfile` and crawl each one in a
# pool of 3 worker processes.  `wordfile` and `client` are module-level names
# defined earlier in the file.
# Fix: the original called open(wordfile).readlines() without ever closing the
# handle; a `with` block closes the file deterministically.
with open(wordfile) as word_fh:
    words = word_fh.readlines()
pool = multiprocessing.Pool(processes=3)
for word in words:
    word = word.strip()  # drop the trailing newline (and surrounding spaces)
    pool.apply_async(getContent, (word, client))
pool.close()  # no more tasks will be submitted
pool.join()   # wait for all workers to finish
# NOTE(review): removed stray scraped-page text "我要评论" ("post a comment") — webpage residue, not Python code.