检查网站连通性

# Check website connectivity: read URLs (one per line) from url.txt and
# try to open each; print whether each URL is reachable.
import urllib.request
import urllib.error

opener = urllib.request.build_opener()
# Bug fix: the attribute is `addheaders` (plural); the original set
# `addheader`, which urllib ignores, so the User-Agent was never sent.
opener.addheaders = [('User-agent', 'Mozilla/49.0.2')]

# Bug fix: the original loop read an undefined name (`line` instead of `x`),
# raising NameError, and replaced '\n' with ' ', leaving a trailing space in
# every URL. strip() removes the newline cleanly; `with` closes the file.
with open('url.txt') as url_file:
    url_lst = [raw.strip() for raw in url_file if raw.strip()]
print(url_lst)

for the_url in url_lst:
    try:
        opener.open(the_url)
        print(the_url+'没问题')
    except (urllib.error.HTTPError, urllib.error.URLError):
        # Either HTTP-level error (4xx/5xx) or connection/DNS failure.
        print(the_url+'访问页面出错')

CURL利用IO随便返回页面

# CURL利用IO随便返回页面,不会打印出网页HTML页面,然后返回页面状态码
import pycurl
from io import BytesIO

def connect_check(target_url):
    """Fetch *target_url* with pycurl, discarding the response body.

    Returns the HTTP status code (int) on a completed transfer, or
    False if the transfer fails (timeout, DNS error, refused, ...).
    """
    handle = pycurl.Curl()
    sink = BytesIO()  # only used to swallow the returned page
    handle.setopt(pycurl.CONNECTTIMEOUT, 60)  # connection wait time (seconds)
    handle.setopt(pycurl.URL, target_url)
    handle.setopt(pycurl.WRITEFUNCTION, sink.write)
    # Bug fix: constant was misspelled FORBID_RESUE, which raises
    # AttributeError. Force the connection to be dropped after use and
    # never reused.
    handle.setopt(pycurl.FORBID_REUSE, 1)
    handle.setopt(pycurl.FRESH_CONNECT, 1)
    try:
        handle.perform()
        # Bug fix: the status must be read from the Curl handle; the
        # original read `the_cod.getinfo(...)` from an undefined name.
        return handle.getinfo(pycurl.HTTP_CODE)
    except Exception:
        return False
    finally:
        # Bug fix: close() was placed after `return` on both paths and
        # never ran, leaking the handle. `finally` runs on every path.
        handle.close()

# Smoke test: probe Google and print the resulting HTTP status code
# (or False if the connection fails).
print(connect_check('https://google.com'))

多线程爬取URL状态

# 多线程爬取URL状态
import urllib
import os
import time
import threading

# Shared accumulator: URLs that answered HTTP 200 (appended by all workers).
result = []

# 将扫描的结果写到文件中
# Append one scan result to the log file.
def record_result(con):
    """Append *con* as one line to result.log in the current directory."""
    path = os.path.join(os.getcwd(), "result.log")
    # `with` guarantees the handle is closed even if write() raises;
    # the original leaked the file object on error.
    with open(path, "a", encoding='utf-8') as out:
        out.write(con + "\n")

# 获取URL的HTTP状态
# Probe one URL's HTTP status.
def get_status(url):
    """Probe *url*; on HTTP 200, append it to `result` and log it.

    All errors (timeout, DNS failure, HTTP errors) are deliberately
    swallowed: this is a best-effort scanner and most generated URLs
    are expected not to exist.
    """
    # Bug fix: the file only does `import urllib`, which does not bind the
    # `urllib.request` submodule; import it explicitly here.
    import urllib.request
    try:
        code = urllib.request.urlopen(url, timeout=1).getcode()
        if code == 200:
            result.append(url)
            record_result(url)
    except Exception:
        pass

# 标准URL
# https://bwoil-fileserver.oss-cn-shenzhen.aliyuncs.com/2009281519005901954.xlsx
nus = 0            # 7-digit serial counter, shared (unsynchronized) by all workers
tim = 1601222400   # scan window start, Unix timestamp
end = 1601265600   # scan window end, Unix timestamp
# end = 1601308800

# 线程执行函数,即生成URL不断去尝试
# Worker thread body: keep generating candidate URLs and probing them.
def gen_url(num):
    """Walk the (timestamp, 7-digit serial) space and probe each candidate.

    For every second in [tim, end) and serial 0000000..9999999, builds a
    filename of the form 20<MMDDHHMMSS><serial> and probes it with three
    extensions (.jpg/.png/.xlsx). The `num` argument (thread index) is
    unused.

    NOTE(review): `nus` and `tim` are read-modify-written by 1000 threads
    with no lock, so increments race — the same URL may be probed several
    times and some serials skipped. Confirm duplicates/gaps are acceptable.
    """
    global nus,tim
    while tim < end:
        # Timestamp part of the filename: month/day/hour/minute/second.
        dat = time.strftime("%m%d%H%M%S", time.localtime(tim))
        # Candidate name = timestamp + zero-padded 7-digit serial.
        urn = dat+str(nus).zfill(7)
        print(urn)
        url = "https://bwoil-fileserver.oss-cn-shenzhen.aliyuncs.com/20"+urn
        get_status(url+".jpg")
        get_status(url+".png")
        get_status(url+".xlsx")
        # Roll the serial; when it wraps, advance to the next second.
        if nus == 9999999:
            nus = 0
            tim+=1
        else:
            nus+=1

def main():
    """Launch 1000 scanner threads, then return after a 1-second grace period."""
    # Identify the main thread.
    print('这是主线程:', threading.current_thread().name)

    # Build every worker first, then start them all.
    workers = [
        threading.Thread(target=gen_url, args=(idx,))
        for idx in range(1000)
    ]
    for worker in workers:
        worker.start()

    # Note: workers are not joined; the main thread just pauses briefly.
    time.sleep(1)
    print('主线程结束!'+threading.current_thread().name)

# Entry point left disabled on purpose: uncommenting launches the
# 1000-thread scan against the remote file server.
# main()
print(result)  # with main() disabled this always prints []
Last modification: February 28th, 2021 at 02:51 pm
硬币投入口