'爬虫练习脚本'

发表于 2020-05-12 更新于 2024-01-18 分类于 python 阅读次数：

西拉代理爬虫：抓取代理 IP，逐个验证其可用性，并将可用的代理写入 MySQL 数据库。

验证部分加了多线程；不过这个站点好像有点问题，目前正在写其它站点的爬虫。

# -*-coding:utf-8-*-
import requests
from bs4 import BeautifulSoup
import requests
import threading
import pymysql
from socket import timeout
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
from fake_useragent import UserAgent
import time
#Author:sunian

# Candidate proxies scraped by get_data(): the first cell of every table row.
port_list = []
# Proxies that passed the probe in validateIp(); printed at the end of the run.
OK_list = []
def get_data():
    """Scrape proxy addresses from xiladaili.com into the global ``port_list``.

    Requests listing pages 1 through 199 of the ``gaoni`` section. For each
    page, every ``<tr>`` after the first one is inspected and the first child
    of its first ``<td>`` is appended to ``port_list``.

    A page that cannot be downloaded is reported on stdout and skipped; a row
    without a usable first cell is skipped without discarding the rest of the
    page. Nothing is returned.
    """
    # One generator is enough; ``.random`` still yields a fresh UA per page.
    ua = UserAgent()
    for page in range(1, 200):
        headers = {'User-Agent': ua.random}
        url = "http://www.xiladaili.com/gaoni/{0}/".format(page)
        try:
            # Without a timeout a single stalled page would hang the crawl.
            html = requests.get(url, headers=headers, timeout=10).text
        except requests.RequestException as error:
            print(error)
            continue
        soup = BeautifulSoup(html, "html.parser")
        # The first row is skipped — presumably the table header; verify
        # against the live page layout.
        for row in soup.findAll('tr')[1:]:
            tds = row.findAll("td")
            if not tds or not tds[0].contents:
                continue
            # Proxy address; validateIp() prepends a scheme to it, so the site
            # is assumed to publish it as "ip:port" — TODO confirm.
            port_list.append(tds[0].contents[0])

def validateIp(http_proxy):
    """Probe a proxy and record it when the probe succeeds.

    Opens the Taobao IP-info endpoint through ``http_proxy`` with a 3 second
    timeout. When the response body contains ``taobao_callback`` the proxy is
    appended to the global ``OK_list`` and stored via ``WriteMysql``.

    Args:
        http_proxy: Proxy address without scheme; it is concatenated after
            ``http://`` / ``https://`` to build the proxy URLs.

    Returns:
        True when the proxy answered correctly and was recorded, False when
        the probe failed, timed out or returned an unexpected body.
    """
    from http.client import HTTPException

    proxy = http_proxy
    proxy_handler = ProxyHandler({
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    })
    opener = build_opener(proxy_handler)
    try:
        # The context manager closes the response on every path, including
        # the "unexpected body" one that previously leaked the connection.
        with opener.open('https://tbip.alicdn.com/api/getipinfo?callback=taobao_callback', timeout=3) as response:
            # A broken proxy may return arbitrary bytes; never fail on decode.
            body = response.read().decode('utf-8', errors='replace')
    except URLError as error:
        print(error.reason)
        return False
    except timeout:
        print('超时',proxy)
        return False
    except (OSError, HTTPException) as error:
        # Resets, TLS failures and malformed status lines are routine with
        # dead proxies and must not kill the worker thread.
        print(error)
        return False

    if 'taobao_callback' not in body:
        return False

    OK_list.append(http_proxy)
    # If threads contend for the database, look into deadlock handling.
    WriteMysql(proxy)
    return True

def WriteMysql(ip_temp):
    """Insert one proxy address into the ``xiciip`` table.

    Opens a new connection to the local MySQL server (database ``xiciip``),
    inserts ``ip_temp`` as a single-column row, commits, and always closes the
    connection afterwards.

    Args:
        ip_temp: Proxy address to store; it is converted with ``str()`` before
            being bound to the statement.

    Raises:
        pymysql.MySQLError: Propagated unchanged when connecting or inserting
            fails.
    """
    db = pymysql.connect(db='xiciip', host='localhost', port=3306, user='root', passwd='root',charset='utf8')
    try:
        with db.cursor() as cursor:
            # Bind the value as a parameter: it is scraped from a web page and
            # must never be spliced into the SQL text.
            cursor.execute("insert into `xiciip` VALUE(%s)", (str(ip_temp),))
        db.commit()
    finally:
        # One connection is opened per call, so leaking it would exhaust the
        # server's connection limit during a long scan.
        db.close()

def run(thread_function, *args):
    """Run ``thread_function`` once per item of ``args[0]``, one thread each.

    All threads are created first, then all are started, and finally the call
    blocks until every one of them has finished.

    Args:
        thread_function: Callable invoked with a single positional argument.
        *args: Only the first extra argument is used; it is the iterable of
            items to hand out, one per thread.
    """
    workers = [
        threading.Thread(target=thread_function, args=(item,))
        for item in args[0]
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


def main():
    """Scrape the proxy list, then validate it in batches of threads.

    Calls ``get_data()`` to fill the global ``port_list`` and feeds it to
    ``run(validateIp, ...)`` in consecutive slices of ``thread_num`` items, so
    at most ``thread_num`` validation threads are alive at a time.
    """
    get_data()
    thread_num = 50  # number of threads (proxies) per batch
    # The step must equal the slice length. Stepping by thread_num + 1 while
    # slicing thread_num items skipped one proxy between every two batches.
    for start in range(0, len(port_list), thread_num):
        run(validateIp, port_list[start:start + thread_num])

if __name__ == '__main__':
    # Script entry point: time the full scan, then list every working proxy.
    print('扫描开始')
    began = time.time()
    main()
    elapsed = time.time() - began
    print('扫描完毕，用时：', elapsed)

    for working_proxy in OK_list:
        print(working_proxy)

妹子图爬虫

5分钟爬取2500页

import requests
from fake_useragent import UserAgent
import re
from multiprocessing.pool import ThreadPool

# Shared User-Agent generator; get_tu and get_pian read ua.random when
# building the headers of each request.
ua = UserAgent()
# Thread pool with 100 workers; get_tu submits get_pian tasks to it.
pool = ThreadPool(100)

def get_tu(num):
    """Crawl tag listing pages and queue download tasks for every album.

    Fetches listing pages ``1 .. num - 1`` of the ``beautyleg`` tag, extracts
    the album links from each page and submits ``get_pian(album, n)`` to the
    global thread pool for ``n`` in ``1 .. 99``.

    Args:
        num: Exclusive upper bound of the listing page numbers to crawl.

    Any error while handling a listing page is printed and the crawl moves on
    to the next page.
    """
    for page in range(1, num):
        headers = {'User-Agent': ua.random,
                   "Referer": "https://www.mzitu.com/",}
        url = "https://www.mzitu.com/tag/beautyleg/page/{0}/".format(page)
        try:
            # A timeout keeps one stalled listing page from blocking the crawl.
            r = requests.get(url, headers=headers, timeout=10)
            # The first match is dropped — presumably a non-album link;
            # verify against the page markup.
            srcs = re.findall('li>.*?<a href="(.*?)"',r.text,re.S)[1:]
            for src_url in srcs:
                # Log each album once instead of once per queued task.
                print(src_url)
                # Distinct loop name: the original reused the page variable.
                for pic_no in range(1, 100):
                    pool.apply_async(get_pian, args=(src_url, pic_no))
        except Exception as e:
            print(e)

def get_pian(src_url,num, save_dir='D:/meizi'):
    """Download every image found on one page of an album.

    Fetches ``src_url + "/<num>/"``, stops if it is the site's 404 page, and
    otherwise saves each ``<img src="...">`` target into ``save_dir`` under the
    last path segment of its URL.

    Args:
        src_url: Album URL; also sent as the ``Referer`` header.
        num: Page number inside the album.
        save_dir: Directory the images are written to; created if missing.

    Failures are printed rather than raised, because this runs as a pool task
    whose result is never collected. A failing image does not stop the
    remaining images of the page.
    """
    import os

    headers = {'User-Agent': ua.random,
               "Referer": src_url, }
    url = src_url+"/{0}/".format(num)
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as error:
        print(error)
        return
    if "404 - 妹子图" in r.text:
        # Page number past the end of the album: nothing to download. The
        # original check fell through and scraped the error page anyway.
        return

    imgs = re.findall('<img src="(.*?)" ', r.text, re.S)
    if not imgs:
        return
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError as error:
        print(error)
        return

    for img in imgs:
        filename = img.rsplit('/', maxsplit=1)[-1]
        if not filename:
            continue
        filepath = '%s/%s' % (save_dir, filename)
        try:
            jieguo = requests.get(img, headers=headers, timeout=10).content
            print(filepath)
            with open(filepath,'wb') as f:
                f.write(jieguo)
        except (requests.RequestException, OSError) as error:
            print(error)

if __name__ == '__main__':
    # get_tu iterates range(1, num), so passing 2 crawls listing page 1 only.
    get_tu(2)
    # No further tasks will be submitted; wait for the queued downloads to
    # finish before the script exits.
    pool.close()
    pool.join()