关于python:☀️SVG映射反爬示例练习⚡直接提取SVG文字图片的文本⚡

6次阅读

共计 1895 个字符,预计需要花费 5 分钟才能阅读完成。

import re
from requests_html import HTMLSession
from selenium import webdriver
from bisect import bisect
def parseAndReplaceSvgNode(d_tags):
    """Replace each SVG-mapped element with the character it displays.

    For every element, the digit runs of its CSS ``background-position`` are
    looked up with ``bisect`` in the module-level ``xs``/``ys`` lists to pick
    a character out of ``data``; the element is then swapped for a plain text
    node through the module-level ``browser``.

    Args:
        d_tags: Iterable of Selenium elements to replace in the live DOM.
    """
    for d_tag in d_tags:
        position = d_tag.value_of_css_property("background-position")
        # Only digit runs are captured, so the sign of each offset is dropped.
        x, y = map(int, re.findall(r"\d+", position))
        num = data[bisect(ys, y)][bisect(xs, x)]
        # Replace the node with ordinary text. The character is passed as a
        # script argument rather than interpolated into the JavaScript source,
        # so a quote or backslash in it cannot break the script.
        browser.execute_script("""
            var element = arguments[0];
            element.parentNode.replaceChild(
                document.createTextNode(arguments[1]), element);
        """, d_tag, num)

browser = webdriver.Chrome()
# NOTE(review): this URL is truncated ("…") in this copy of the article;
# restore the full address before running.
url = 'http://www.porters.vip/confus…'
browser.get(url)
# Read the SVG sprite URL from the first matching element's background-image.
d_tag = browser.find_element_by_css_selector('d[class^="vhk"]')
background_image_url = d_tag.value_of_css_property("background-image")
# Drops the first 5 and last 2 characters -- presumably the `url("` prefix
# and `")` suffix of the CSS value; confirm against the actual page.
svg_url = background_image_url[5:-2]
session = HTMLSession()
html_session = session.get(svg_url)
xs = []    # x offsets, taken from the first <text> element only
ys = []    # y offset of each <text> element
data = []  # characters of each <text> element, one list per element
for text_tag in html_session.html.xpath(r"//text"):
    if not xs:
        xs.extend(map(int, text_tag.xpath(".//@x")[0].split()))
    ys.append(int(text_tag.xpath(".//@y")[0]))
    data.append(list(text_tag.xpath(".//text()")[0]))

# Replace every SVG-mapped node in the DOM with its text in one pass.
parseAndReplaceSvgNode(
    browser.find_elements_by_css_selector('d[class^="vhk"]'))

# Remove the <a> tag inside the title element.
element = browser.find_element_by_css_selector('.title a')
browser.execute_script("""
    var element = arguments[0];
    element.parentNode.removeChild(element);
""", element)

# Get the title.
title = browser.find_element_by_class_name("title").text

# Get the comments.
comment = browser.find_element_by_class_name("comments").text

# Average price per person.
avgPrice = browser.find_element_by_class_name('avgPriceTitle').text

# Taste, environment and service scores, read in that order.
comment_score_tags = browser.find_elements_by_css_selector(
    ".comment_score .item")
taste = comment_score_tags[0].text
environment = comment_score_tags[1].text
service = comment_score_tags[2].text

# Address.
address = browser.find_element_by_css_selector('.address .address_detail').text

# Characteristic.
characteristic = browser.find_element_by_css_selector(
    '.characteristic .info-name').text

# Phone.
phone = browser.find_element_by_class_name("more").text
print(title, comment, avgPrice, taste, environment,
      service, address, characteristic, phone)
正文完
 0