检查网站连通性
# 检查网站连通性
import urllib.request,time
# Build a reusable opener that sends a browser-like User-Agent on every request.
opener = urllib.request.build_opener()
# OpenerDirector takes its default headers from `addheaders` (plural); the
# original assigned to `addheader`, which only created an unused attribute.
opener.addheaders = [('User-agent', 'Mozilla/49.0.2')]

# url.txt holds the URLs to check, one per line.
with open('url.txt') as file:
    lines = file.readlines()

# Turn the raw lines into a clean list: strip surrounding whitespace
# (including the trailing newline) and skip blank lines, so that only the
# URL text itself is requested.
url_lst = [line.strip() for line in lines if line.strip()]
print(url_lst)

for the_url in url_lst:
    try:
        # Only reachability matters, so close the response without reading it.
        opener.open(the_url).close()
        print(the_url + '没问题')
    except urllib.error.URLError:
        # HTTPError is a subclass of URLError, so this single handler covers
        # both HTTP error statuses and lower-level connection failures.
        print(the_url + '访问页面出错')
CURL利用IO随便返回页面
# CURL利用IO随便返回页面,不会打印出网页HTML页面,然后返回页面状态码
import pycurl
from io import BytesIO
def connect_check(target_url):
    """Fetch *target_url* with pycurl and report its HTTP status code.

    The response body is written to an in-memory buffer and discarded, so the
    page HTML is never printed.

    Args:
        target_url: URL to request.

    Returns:
        The HTTP status code of the response, or ``False`` when pycurl
        reports an error (``pycurl.error``) while configuring or performing
        the transfer.
    """
    the_cul = pycurl.Curl()
    bio_str = BytesIO()  # sink for the response body, which is ignored
    try:
        the_cul.setopt(pycurl.CONNECTTIMEOUT, 60)  # connection wait time
        the_cul.setopt(pycurl.URL, target_url)
        the_cul.setopt(pycurl.WRITEFUNCTION, bio_str.write)
        # Force the connection to be dropped after the transfer and never
        # reuse a cached one. (The option is FORBID_REUSE; the original
        # spelling FORBID_RESUE does not exist.)
        the_cul.setopt(pycurl.FORBID_REUSE, 1)
        the_cul.setopt(pycurl.FRESH_CONNECT, 1)
        the_cul.perform()
        # Query the handle itself for the status code.
        return the_cul.getinfo(pycurl.HTTP_CODE)
    except pycurl.error:
        return False
    finally:
        # Release the handle on every path, success or failure.
        the_cul.close()


print(connect_check('https://google.com'))
多线程爬取URL状态
# 多线程爬取URL状态
import urllib
import os
import time
import threading
# URLs that answered with HTTP 200; filled in by get_status.
result = []


def record_result(con):
    """Append *con* as one line to ``result.log`` in the current directory.

    Args:
        con: Text to record; a trailing newline is added.
    """
    fil = os.path.join(os.getcwd(), "result.log")
    # The context manager closes the file even if the write fails.
    with open(fil, "a", encoding='utf-8') as out:
        out.write(con + "\n")
def get_status(url):
    """Probe *url* and remember it when the server answers with HTTP 200.

    A hit is appended to the module-level ``result`` list and written to the
    log through ``record_result``. Requests that fail (network errors, HTTP
    error statuses, timeouts, malformed URLs) are ignored.

    Args:
        url: URL to request; the request uses a 1 second timeout.

    Note:
        Errors raised by ``record_result`` while logging a hit are no longer
        swallowed; they propagate to the caller.
    """
    # Imported here so the function does not rely on a bare `import urllib`,
    # which does not load the `urllib.request` submodule.
    import http.client
    import urllib.request

    try:
        # The context manager closes the connection once the status is read.
        with urllib.request.urlopen(url, timeout=1) as response:
            code = response.getcode()
    except (OSError, ValueError, http.client.HTTPException):
        # Best-effort probe: an unreachable or rejected URL is simply not a
        # hit. URLError/HTTPError and socket timeouts are OSError subclasses.
        return
    if code == 200:
        result.append(url)
        record_result(url)
# Reference URL that the generated candidates imitate:
# https://bwoil-fileserver.oss-cn-shenzhen.aliyuncs.com/2009281519005901954.xlsx
# i.e. "20" + mmddHHMMSS timestamp + 7-digit sequence number + extension.
# NOTE(review): this enumerates object names on a third-party host; only run
# it against a server you are authorized to test.
nus = 0           # next 7-digit sequence number to try
tim = 1601222400  # Unix timestamp currently being scanned (window start)
end = 1601265600  # scanning stops once tim reaches this value
# end = 1601308800
# Guards nus/tim, which are shared by every thread running gen_url.
_cursor_lock = threading.Lock()


def _next_candidate():
    """Claim the next (timestamp, sequence) pair, or None when the window is done."""
    global nus, tim
    with _cursor_lock:
        if tim >= end:
            return None
        claimed = (tim, nus)
        # Advance the cursor: roll the sequence over into the next second.
        if nus == 9999999:
            nus = 0
            tim += 1
        else:
            nus += 1
        return claimed


def gen_url(num):
    """Thread body: keep generating candidate URLs and probing them.

    Each candidate is claimed under a lock, so concurrent workers never probe
    the same name twice and never lose a counter update.

    Args:
        num: Worker index supplied by the caller; not used.
    """
    while True:
        claimed = _next_candidate()
        if claimed is None:
            break
        stamp, seq = claimed
        # NOTE(review): localtime() depends on the machine's time zone;
        # confirm it matches the zone the target file names were created in.
        dat = time.strftime("%m%d%H%M%S", time.localtime(stamp))
        urn = dat + str(seq).zfill(7)
        print(urn)
        url = "https://bwoil-fileserver.oss-cn-shenzhen.aliyuncs.com/20" + urn
        get_status(url + ".jpg")
        get_status(url + ".png")
        get_status(url + ".xlsx")
def main(thread_count=1000):
    """Start the scanner threads and wait until all of them have finished.

    Args:
        thread_count: Number of worker threads running ``gen_url``.
            Defaults to 1000, the value that used to be hard-coded.
    """
    # Main thread
    print('这是主线程:', threading.current_thread().name)
    # Create the workers; each receives its index as the gen_url argument.
    thread_list = [
        threading.Thread(target=gen_url, args=(i,)) for i in range(thread_count)
    ]
    # Start every worker.
    for t in thread_list:
        t.start()
    # Wait for the workers instead of sleeping for a fixed second, so that
    # `result` is complete by the time main() returns.
    for t in thread_list:
        t.join()
    print('主线程结束!' + threading.current_thread().name)


# main()
print(result)
2 comments
想想你的文章写的特别好https://www.ea55.com/
看的我热血沸腾啊https://www.ea55.com/