# 多线程爬取URL状态,检查URL是否有效
import os
import threading
import time
import urllib
import urllib.request

# Shared list of URLs that answered HTTP 200; appended from worker threads.
result = []

# Write a scan hit to the result file.
def record_result(con):
    """Append one line of scan output to result.log in the current directory.

    Parameters:
        con: the text (a hit URL) to record.

    NOTE(review): called concurrently from many threads with no lock;
    single short appends are usually safe in practice, but confirm.
    """
    path = os.path.join(os.getcwd(), "result.log")
    # Context manager guarantees the file is closed even if write() raises.
    with open(path, "a", encoding="utf-8") as out:
        out.write(con + "\n")

# Probe the HTTP status of a URL.
def get_status(url):
    """Fetch *url* and record it (in `result` and result.log) on HTTP 200.

    Parameters:
        url: absolute URL to probe, with a 1-second timeout.

    All network failures (DNS errors, timeouts, and the HTTPError urllib
    raises for non-2xx responses) are deliberately swallowed: this is a
    best-effort scanner and misses are expected.
    """
    try:
        # `with` closes the response, avoiding socket leaks across many calls.
        with urllib.request.urlopen(url, timeout=1) as resp:
            code = resp.getcode()
    except Exception:
        return
    # Keep the bookkeeping outside the try so its own errors are not hidden.
    if code == 200:
        result.append(url)
        record_result(url)

# Scanner state shared by every worker thread.
# NOTE(review): `nus` and `tim` are read-modify-written by up to 1000
# threads with no lock, so duplicate or skipped values are possible.
nus = 0
# Unix timestamps bounding the scan window (interpreted via localtime,
# so the wall-clock meaning depends on the host timezone — confirm).
tim = 1601222400
end = 1601265600
# end = 1601308800

# Thread body: keep generating candidate URLs and probing them.
def gen_url(num):
    """Enumerate (timestamp, 7-digit counter) pairs and probe each URL.

    Parameters:
        num: thread index supplied by main(); currently unused.

    For every candidate id, three extensions (.jpg/.png/.xlsx) are tried.
    NOTE(review): the shared globals `nus`/`tim` are updated without any
    synchronization, so concurrent workers race on them — presumably an
    accepted trade-off for a brute-force scan; confirm.
    """
    global nus,tim
    while tim < end:
        # Format the timestamp as MMDDHHMMSS in *local* time (no year digits;
        # the literal "20" below supplies the leading part of the name).
        dat = time.strftime("%m%d%H%M%S", time.localtime(tim))
        # Candidate id = timestamp digits + zero-padded 7-digit sequence.
        urn = dat+str(nus).zfill(7)
        print(urn)
        url = "https://aliyuncs.com/20"+urn
        get_status(url+".jpg")
        get_status(url+".png")
        get_status(url+".xlsx")
        # Roll the counter; after 9,999,999 advance one second and restart it.
        if nus == 9999999:
            nus = 0
            tim+=1
        else:
            nus+=1

def main():
    """Spawn 1000 scanner threads, pause briefly, then return.

    The workers are intentionally not joined: the function prints its
    exit message after one second while the threads keep running.
    """
    # Report which thread is acting as the main thread.
    print('这是主线程:', threading.current_thread().name)

    # Build all worker threads first, then launch them in a second pass.
    workers = [threading.Thread(target=gen_url, args=(idx,)) for idx in range(1000)]
    for worker in workers:
        worker.start()

    # Give the workers a head start before announcing the main thread's exit.
    time.sleep(1)
    print('主线程结束!' + threading.current_thread().name)

# Entry point left disabled; uncomment to launch the scan.
# main()
# With main() disabled this prints the (still empty) shared hit list.
print(result)
# Last modification: February 28th, 2021 at 03:38 pm
# 硬币投入口  (blog-footer text — "coin slot"/donation box — copy-paste residue,
# commented out so the file remains valid Python)