# python 静态爬虫
# 1. 导入依赖
import requests
from bs4 import BeautifulSoup
# 2. 设置反爬
# 一般只需要获取 User-Agent 和 Referer 这两个请求头就可以模拟浏览器操作了
# Request headers that make us look like a real desktop Edge browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69"
    ),
    "Referer": "https://display.xywy.com/",
}
# 3. 获取网页的静态资源
# 发起请求
#网页的链接
# Target page: a disease-overview article on xywy.com.
url = "http://jib.xywy.com/il_sii/gaishu/624.htm"
# Send the request with the anti-bot headers; `timeout` is a fix — requests
# has NO default timeout, so without it a stalled server hangs the script.
response = requests.get(url, headers=headers, timeout=10)
# Parse the raw bytes instead of response.text so BeautifulSoup detects the
# document's declared encoding itself (requests' charset guess can mojibake
# Chinese pages — NOTE(review): confirm this site's charset if output looks wrong).
soup = BeautifulSoup(response.content, "html.parser")
# 4. Locate the elements we need in the parsed page.
# find_all() returns a list of every element whose class is 'pr';
# take the text of the first match.
pr_nodes = soup.find_all(attrs={'class': 'pr'})
text = pr_nodes[0].text
# find() returns only the first matching element (or None when absent).
p1 = soup.find(attrs={'class': 'mt20 articl-know'})
# 4.3 查找规则
# - attrs={'class': 'mt20 articl-know'} 为查找有 class 属性为 mt20 articl-know 的资源
# - "p" 表示查找 p 标签
# - .text 表示获取标签的文本内容
# - ["href"] 表示获取标签属性的值
# 5. 示例代码(爬取虎牙美女图片保存到当前目录的 image 文件中)
import io
import requests
from bs4 import BeautifulSoup
from PIL import Image
import os
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple 包名
def f(binary_data, name):
    """Decode downloaded image bytes and save them as ``image/<name>.png``.

    binary_data: raw bytes of the image (e.g. ``response.content``).
    name: file stem used for the saved PNG (here: the streamer's name).
    """
    # Fix: the original created 'img' but saved under 'image/', so the
    # save() below always failed with a missing-directory error.
    os.makedirs('image', exist_ok=True)
    # Wrap the bytes in an in-memory file object so PIL can decode them.
    image = Image.open(io.BytesIO(binary_data))
    # NOTE(review): `name` comes from scraped page text and may contain
    # characters invalid in filenames (e.g. '/'), which would make save()
    # raise — consider sanitizing upstream.
    image.save(f'image/{name}.png')
def crawl_images(url):
    """Crawl one Douyu category listing page and save every cover image.

    url: the listing page to scrape; images are written via ``f()`` into
    the local ``image/`` directory.
    """
    # Fix: send the module-level anti-bot headers (the original request
    # went out bare, defeating the point of section 2) and add a timeout.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Covers live in <ul class="layout-Cover-list"> / <li class="layout-Cover-item">.
    ul = soup.find(attrs={'class': 'layout-Cover-list'})
    if ul is None:
        # Layout changed or we were blocked — nothing to scrape here.
        print(f"Error: no cover list found at {url}")
        return
    lis = ul.find_all(attrs={'class': 'layout-Cover-item'})
    for li in lis:
        try:
            image = li.find_all('source')
            name = li.find(attrs={'class': 'DyListCover-userName is-template'})
            print("-----------------------")
            # srcset is "url1 1x, url2 2x, ..." — split() takes the bare URL.
            # assumes at least two <source> tags per item; index 1 — TODO confirm.
            image_url = image[1]['srcset'].split()[0]
            title = name.text
            print("imgName ====" + image_url)
            response = requests.get(image_url, headers=headers, stream=True, timeout=10)
            if response.status_code == 200:
                print(f"Image URL: {image_url}")
                f(response.content, rf"{title}")
            print("-----------------------")
        except Exception as e:
            # Best effort: one bad item must not stop the whole crawl.
            print(f"Error: {str(e)}")
            continue
# Listing pages to crawl; add more category URLs here as needed.
urls = ["https://www.douyu.com/g_HW"]
# Kick off the crawl for each configured page.
for url in urls:
    crawl_images(url)