用Python获取代理ip

作者: admin 分类: Python 发布时间: 2017-07-27 15:58

业余学习scrapy,撸了一下反爬策略,继续怼了一波代理ip,

#!/usr/bin/env python
# coding:utf-8
# author:2amor

import requests
from scrapy.selector import Selector
import pymysql

"""
create table proxy_ip(
    ip varchar(20) not null primary key,
    port varchar(255) not null,
    speed float,
    proxy_type varchar(5)

)
"""


# Connect to the local MySQL database (assumes a local server with an
# empty root password and an existing `spider` schema — TODO confirm)
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='spider', charset='utf8')
# Module-level cursor shared by GetIP and crawl_ips below
cursor = conn.cursor()

class GetIP(object):
    """Hand out a random, validated proxy URL from the proxy_ip table.

    Invalid proxies are deleted from the table as they are discovered.
    Relies on the module-level ``conn``/``cursor`` created at import time.
    """

    def delete_ip(self, ip):
        """Delete an unusable proxy from the proxy_ip table.

        Uses a parameterized query so a hostile ``ip`` string cannot
        inject SQL.
        """
        cursor.execute("delete from proxy_ip where ip=%s", (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        """Return True if the proxy answers an HTTP GET; otherwise drop it.

        Any connection failure or non-2xx status counts as invalid and
        removes the row via ``delete_ip``.
        """
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        # requests expects a scheme -> proxy-URL mapping, not a bare string.
        proxy_dict = {
            "http": proxy_url
        }
        try:
            # Timeout so a dead proxy cannot hang the caller forever.
            response = requests.get(url=http_url, proxies=proxy_dict, timeout=5)
        except Exception:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        code = response.status_code
        if 200 <= code < 300:
            print("effective ip")
            return True
        print("invalid ip and port")
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return a working "http://ip:port" proxy URL, or None if the table is empty.

        Picks one random row, validates it, and recurses until a working
        proxy is found (each failed candidate is deleted first, so the
        recursion shrinks the table).
        """
        random_sql = """
                SELECT ip,port FROM proxy_ip ORDER BY RAND() LIMIT 1
            """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            if self.judge_ip(ip, port):
                return "http://{0}:{1}".format(ip, port)
            return self.get_random_ip()
        # Table is empty: no proxy available.
        return None
        

#Scrape proxy IPs from xicidaili.com
def crawl_ips():
    """Crawl proxy listing pages from xicidaili.com and store rows in proxy_ip.

    Side effect only: inserts (ip, port, speed, proxy_type) rows via the
    module-level MySQL connection, committing after each row.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2"}
    for i in range(1568):
        # NOTE(review): path looks like it should be a paginated listing
        # (e.g. "/nn/{page}") — confirm against the live site layout.
        resp = requests.get("http://www.xicidaili.com/{0}".format(i), headers=headers)
        selector = Selector(text=resp.text)
        all_trs = selector.css("#ip_list tr")
        ip_list = []
        for tr in all_trs[1:]:
            # Default so a row with an empty speed cell cannot leave
            # `speed` unbound (or silently reuse the previous row's value).
            speed = 0.0
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()
            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]
            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            # Parameterized INSERT (no string formatting into SQL);
            # ON DUPLICATE KEY UPDATE keeps re-crawls from raising on the
            # ip primary key and refreshes the measured speed instead.
            # NOTE(review): proxy_type is hard-coded 'HTTP' as in the
            # original; the scraped value ip_info[2] is collected but unused.
            cursor.execute(
                "insert into proxy_ip(ip,port,speed,proxy_type) "
                "VALUES(%s,%s,%s,'HTTP') "
                "ON DUPLICATE KEY UPDATE port=VALUES(port), speed=VALUES(speed)",
                (ip_info[0], ip_info[1], ip_info[3])
            )
            conn.commit()


if __name__ == '__main__':
    # Script entry point: fetch (and discard) one validated random proxy.
    GetIP().get_random_ip()
可以去研究一下:
https://github.com/scrapy-plugins/scrapy-crawlera
使用tor代理
http://mars.run/2015/08/tor-for-proxy-pool/

github比较好的获取代理ip项目:
https://github.com/qiyeboy/IPProxyPool

一条评论
  • 做安全的叼毛

    2017年7月28日 下午11:15

    six six

发表评论

电子邮件地址不会被公开。 必填项已用*标注

标签云