Python selenium chrome parses blob url
When crawling video sites, many videos use a blob URL to hide the source address. With the browser's developer tools (F12) you can find the m3u8 download address in the corresponding network request, but when the amount of data to crawl is very large, analyzing every page by hand with F12 is obviously impractical. This article parses the blob URL through Selenium and Chrome to get the source m3u8 address.
Example in this article: twitter preview link.
url : https://ton.twimg.com/ads-manager/tweet-preview/index.html?data=H4sIAAAAAAAAAO1YbW%2FbNhD%2BK4T2caotWbIkG9iHNE3TdV3Wpi3QFxUCLVE2U4nUSMqJE%2Fi%2F747yixI76AYUxQYsQGLzeHc8Hu8ePsydY64ZM86U3Dm5YtSwIqMwdJ4rTi7kkoxC4o2mgTf1RuRnD37IyPNjx3V44UwnQZAk42DiJ9E4jsZJMkF5po0CD4eTYFW2VZUZdoNLnFWEpq3nlb4kDdW0kC6pUcB8TQpGJrBYzSsC2oqKK6akdkmj5JIJzoRhVikOwNYaFZrBPKjO6BX8CsIE%2Bel0wQV1SSsoyXmpKBEtWzKyZDmY13QlFfmzZWRBc0YCbxePHpCFMY2epsN0aAa5TIfq1%2BfFb6eX6o8AtmFUK3JMljMtaaWZ6xRcNxVd2b1lEO6cOdPPnusH8RfXgWi54Uw70ztnQfXC0Dl8%2F3znbDJho8ScioLnqPY5Cd2J92UNtnpVz2SFMhi0mqmsRndSbEVqO1mzglPrdnc23siPx%2BMkDIJ4Mnp4Nvcm763uB4nrR%2F7WZwZrgBFmxCakmemBueb1HBJTp0Orkw6fvf5wmbHo%2FdnJSfDk03hw1cydnoPMJnTjRv9jP%2FsQHj2U7RF0qg3P0bkxTHXu7%2Bmym4aKAor9wG%2FfJJ8bwQTToCurdKgNNa1Oh4eVnQ6XvGAQkI%2FFsWoYBrCQRsJQ89vu5M2irWf45dqZ%2BmPPdRabT8VQB0xyJRtn7ToVVVg%2FVjMKO80g6muW3KAipqyt%2F46mrmlVfVsRNEtAgVbZkNfrL2vMlWE2V%2F0y%2Fr%2FY%2FlXFZiX%2FwWKzcWdclBLVqW5YjvAJ%2BIaVEbkTKIuitQKRwWVQcdidPwkj3wdjqjgVpoPSGQfoN%2BA7CUZwTblOLuGOECbrZwjOvQmPHLCd7JcHrZuKl6tsY3VYwtYmHcLubmBn6fBlefGSX52%2FvXwSv5BPl8EAF1q7vbjAOvpxgfmjxLuJRxDZs2V76S0L%2FeaqZefmen4YmU3YjwoMFrvxE4jrbEbP9TP%2FqfyQxRfvrukurgdh0AYc57YC0uHNk7ph8%2FeXr75TSA102ou4oUvvZHX29fLs9uPzZlAHbeJY6Otj4RZevrIVLOsH2VFco0XBMVRaZZ36trYBOivcDmIH07niDap1AlbPWFHQGSoAtQA6UUvBDL%2FtRJZidFisZatydINEJcgpWShW%2FpL2sqBNW3B5D40colgFSkKWsqrkddpZs3edDvkdAyVvrWHnF3LXqVikzhRrKqA2MutAKUPIF8DkHpnscP9QwbKX47abqccsIV%2BMiUzQmm2n0QLziv7iyA%2BjiTcO%2FAQIlxdEfR56OAmb6jw5p%2BfvLpAnnulmS%2F0qRNH%2Bcs49YIbZSnbVaE8B3MZpGzKauwQ%2FRwXKQobfZ%2BAzyn0fJH6Mf8Mkh79RWRyp3o3%2FAa6Gp3ZQJ29lLTWZq7aRSHyxumBct5XhFRPzll4xsurGcDMZWkpV0wF5BUS3gRKGjqLooKKaCGl4zqkl0JZ7gmHBKjAWhbSEGJ4CFfJMRnjdSGWoGEBAQL4NAHSP9nYFxZTOctkK4LKRF46TGCYUcPRiJ%2FZDkAF44%2FtiI4K2cQ%2FfHCftnHhj4sXTcDL1wv2bI8LXA11C%2FXOg%2FlsnQTAJIZcmz2RZanzIdNVheM2yW2iirWAO1wwT2E%2FFtseWTPGS78dd%2FfZ8J%2BMoxPtRzLdeEJoUn7VGwo537ja54DrDh4qG5Et1TIp32EMjSGkJ55fNaP51rmBhzE%2BF9o5nf5yjOrymc%2FaQKtH7FMfqAIMwC1bvPv10OJsPGjH%2Fht8jDOo7uMdSPdj60b08oGv3dDVeb2EURsk4nkSjUWgTlQ
5Hrz%2BJN1fqYyaw9KsNmTtY5tvk8DusBg9QAZD2sMsfWajThpUOwQpWHycR1P8oDHv%2BKy6%2B7iplFITx5LQ3q%2BHGm1GVzaQqIIpHK2qrB4PqcS37pn10FlrmoIL2z2JWUkCkbKP8mPyBVYcqfN93nQDuA8AmbTIEs%2B0UYlm54Qd61%2F67PtyyCIFYsLY4sG9myA6An2E7OwDOnB1r9q1MMfv%2Fkh2sRR0oISbtYMPby%2FZ9vjHsd77Ums%2FgfoPdaOAMy%2F3%2BO8hxIK61SzDRteyAU3T%2FMCB3OyEMEL3W678A5q3yWMoRAAA%3D
Selenium captures all network requests through Chrome's performance log, which is equivalent to the "All" filter of the Network tab in the F12 developer tools.
Parse the request log to get the m3u8 address, download the video with ffmpy3, and finally open the downloaded video with OpenCV to read its attribute values.
pip install selenium ffmpy3 opencv-python
code:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
import re
from selenium.webdriver.chrome.options import Options
from ffmpy3 import FFmpeg
import cv2
def download_video(video_m3u8_url, outputs_path=r'/data/test.mp4', overwrite=True):
    """Download the stream at *video_m3u8_url* to a local file using ffmpeg.

    Args:
        video_m3u8_url: URL of the m3u8 playlist, passed to ffmpeg as its input.
        outputs_path: Destination file. Defaults to the path the original
            script hard-coded, so existing callers are unaffected.
        overwrite: When true, pass ``-y`` to ffmpeg so an existing file at
            *outputs_path* is replaced. Without it ffmpeg asks for
            confirmation (or refuses) when the file already exists, which
            breaks every run after the first in an unattended crawl.

    Returns:
        *outputs_path* on success, or ``None`` if ffmpeg could not be run or
        exited with an error (the error is printed).
    """
    try:
        ffmpeg = FFmpeg(
            global_options='-y' if overwrite else None,
            inputs={video_m3u8_url: None},
            outputs={outputs_path: None},
        )
        ffmpeg.run()
    except Exception as e:
        # Best-effort, as in the original: report the failure and let the
        # caller test the return value instead of crashing the crawl.
        print(e)
        return None
    return outputs_path
def _find_m3u8_url(performance_log):
    """Return the first ``.m3u8`` request URL found in *performance_log*, or None.

    Each log entry is expected to be a mapping whose ``'message'`` value is a
    JSON document. Only ``Network.requestWillBeSentExtraInfo`` events are
    inspected; the URL is rebuilt from the ``:scheme``, ``:authority`` and
    ``:path`` request headers. Entries that are malformed or lack any of
    those headers are skipped rather than aborting the scan.
    """
    for entry in performance_log:
        try:
            # The payload is JSON; it must never be eval()'d. eval would run
            # arbitrary expressions from the page and fails on JSON literals
            # such as true/false/null.
            message = json.loads(entry.get('message')).get('message') or {}
        except (AttributeError, TypeError, ValueError):
            continue
        if message.get('method') != 'Network.requestWillBeSentExtraInfo':
            continue
        headers = (message.get('params') or {}).get('headers') or {}
        path = headers.get(':path')
        # Escaped dot: match the literal ".m3u8" extension only.
        if not path or not re.search(r'\.m3u8', path):
            continue
        authority = headers.get(':authority')
        scheme = headers.get(':scheme')
        if authority and scheme:
            return scheme + '://' + authority + path
    return None


def parse_blob(url):
    """Load *url* in Chrome and return the m3u8 address the page requests.

    Chrome is started with performance logging enabled so that the network
    events emitted while the page loads can be read back through
    ``get_log('performance')``.

    Args:
        url: Address of the page whose player uses a blob URL.

    Returns:
        The first request URL containing ``.m3u8`` that appears in the
        performance log, or ``None`` if none is found or an error occurs
        (the error is printed). The browser is always closed before returning.
    """
    chrome_options = Options()
    prefs = {
        "profile.managed_default_content_settings.images": 1,
        "profile.content_settings.plugin_whitelist.adobe-flash-player": 1,
        "profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player": 1,
    }
    chrome_options.add_experimental_option('prefs', prefs)
    # Copy first: DesiredCapabilities.CHROME is a shared class-level dict and
    # must not be mutated in place.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['goog:loggingPrefs'] = {'performance': 'ALL'}
    chrome_options.add_experimental_option('w3c', False)
    # NOTE(review): the desired_capabilities/chrome_options keywords are kept
    # as in the original; confirm they match the installed Selenium version.
    browser = webdriver.Chrome(desired_capabilities=capabilities, chrome_options=chrome_options)
    try:
        browser.get(url)
        # NOTE(review): implicitly_wait only affects element lookups; the log
        # is read as soon as get() returns. Kept as in the original.
        browser.implicitly_wait(30)
        return _find_m3u8_url(browser.get_log('performance'))
    except Exception as e:
        print(e)
        return None
    finally:
        browser.quit()
def get_video_attribute_value(video_path):
    """Print the basic attributes of the video file at *video_path*.

    Prints, in order: ``fps`` (rounded to an int), ``width``, ``height``,
    ``frame_counter`` and ``duration`` (frame count divided by fps).

    Args:
        video_path: Path of the video file to open with OpenCV.

    Returns:
        None. Results and errors are reported with ``print``.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Otherwise every property reads as 0 and the only diagnostic
            # would be a misleading "division by zero".
            print('cannot open video:', video_path)
            return
        fps = int(round(cap.get(cv2.CAP_PROP_FPS)))
        print('fps:', fps)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        print('width:', width)
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print('height:', height)
        frame_counter = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print('frame_counter:', frame_counter)
        if fps > 0:
            duration = frame_counter / fps
            print('duration:', duration)
        else:
            print('duration: unknown (fps is 0)')
    except Exception as e:
        print(e)
    finally:
        # Release the capture even when a property read fails. No window is
        # ever created here, so cv2.destroyAllWindows() is not called (it
        # raises on OpenCV builds without GUI support).
        if cap is not None:
            cap.release()
if __name__ == '__main__':
    # Example page: a Twitter ad preview link whose player uses a blob URL.
    # The URL is one very long token; it is written as two adjacent string
    # literals (concatenated by Python) because a single-quoted literal
    # cannot span a line break.
    url = (
        'https://ton.twimg.com/ads-manager/tweet-preview/index.html?data=H4sIAAAAAAAAAO1YbW%2FbNhD%2BK4T2caotWbIkG9iHNE3TdV3Wpi3QFxUCLVE2U4nUSMqJE%2Fi%2F747yixI76AYUxQYsQGLzeHc8Hu8ePsydY64ZM86U3Dm5YtSwIqMwdJ4rTi7kkoxC4o2mgTf1RuRnD37IyPNjx3V44UwnQZAk42DiJ9E4jsZJMkF5po0CD4eTYFW2VZUZdoNLnFWEpq3nlb4kDdW0kC6pUcB8TQpGJrBYzSsC2oqKK6akdkmj5JIJzoRhVikOwNYaFZrBPKjO6BX8CsIE%2Bel0wQV1SSsoyXmpKBEtWzKyZDmY13QlFfmzZWRBc0YCbxePHpCFMY2epsN0aAa5TIfq1%2BfFb6eX6o8AtmFUK3JMljMtaaWZ6xRcNxVd2b1lEO6cOdPPnusH8RfXgWi54Uw70ztnQfXC0Dl8%2F3znbDJho8ScioLnqPY5Cd2J92UNtnpVz2SFMhi0mqmsRndSbEVqO1mzglPrdnc23siPx%2BMkDIJ4Mnp4Nvcm763uB4nrR%2F7WZwZrgBFmxCakmemBueb1HBJTp0Orkw6fvf5wmbHo%2FdnJSfDk03hw1cydnoPMJnTjRv9jP%2FsQHj2U7RF0qg3P0bkxTHXu7%2Bmym4aKAor9wG%2FfJJ8bwQTToCurdKgNNa1Oh4eVnQ6XvGAQkI%2FFsWoYBrCQRsJQ89vu5M2irWf45dqZ%2BmPPdRabT8VQB0xyJRtn7ToVVVg%2FVjMKO80g6muW3KAipqyt%2F46mrmlVfVsRNEtAgVbZkNfrL2vMlWE2V%2F0y%2Fr%2FY%2FlXFZiX%2FwWKzcWdclBLVqW5YjvAJ%2BIaVEbkTKIuitQKRwWVQcdidPwkj3wdjqjgVpoPSGQfoN%2BA7CUZwTblOLuGOECbrZwjOvQmPHLCd7JcHrZuKl6tsY3VYwtYmHcLubmBn6fBlefGSX52%2FvXwSv5BPl8EAF1q7vbjAOvpxgfmjxLuJRxDZs2V76S0L%2FeaqZefmen4YmU3YjwoMFrvxE4jrbEbP9TP%2FqfyQxRfvrukurgdh0AYc57YC0uHNk7ph8%2FeXr75TSA102ou4oUvvZHX29fLs9uPzZlAHbeJY6Otj4RZevrIVLOsH2VFco0XBMVRaZZ36trYBOivcDmIH07niDap1AlbPWFHQGSoAtQA6UUvBDL%2FtRJZidFisZatydINEJcgpWShW%2FpL2sqBNW3B5D40colgFSkKWsqrkddpZs3edDvkdAyVvrWHnF3LXqVikzhRrKqA2MutAKUPIF8DkHpnscP9QwbKX47abqccsIV%2BMiUzQmm2n0QLziv7iyA%2BjiTcO%2FAQIlxdEfR56OAmb6jw5p%2BfvLpAnnulmS%2F0qRNH%2Bcs49YIbZSnbVaE8B3MZpGzKauwQ%2FRwXKQobfZ%2BAzyn0fJH6Mf8Mkh79RWRyp3o3%2FAa6Gp3ZQJ29lLTWZq7aRSHyxumBct5XhFRPzll4xsurGcDMZWkpV0wF5BUS3gRKGjqLooKKaCGl4zqkl0JZ7gmHBKjAWhbSEGJ4CFfJMRnjdSGWoGEBAQL4NAHSP9nYFxZTOctkK4LKRF46TGCYUcPRiJ%2FZDkAF44%2FtiI4K2cQ%2FfHCftnHhj4sXTcDL1wv2bI8LXA11C%2FXOg%2FlsnQTAJIZcmz2RZanzIdNVheM2yW2iirWAO1wwT2E%2FFtseWTPGS78dd%2FfZ8J%2BMoxPtRzLdeEJoUn7VGwo537ja54DrDh4qG5Et1TIp32EMjSGkJ55fNaP51rmBhzE%2BF9o5nf5yjOrymc%2FaQKtH7FMfqAIMwC1bvPv10OJsPGjH%2Fht8jDOo7uMdSPdj60b08oGv3dDVeb2EURsk4nkSjUWgTl'
        'Q5Hrz%2BJN1fqYyaw9KsNmTtY5tvk8DusBg9QAZD2sMsfWajThpUOwQpWHycR1P8oDHv%2BKy6%2B7iplFITx5LQ3q%2BHGm1GVzaQqIIpHK2qrB4PqcS37pn10FlrmoIL2z2JWUkCkbKP8mPyBVYcqfN93nQDuA8AmbTIEs%2B0UYlm54Qd61%2F67PtyyCIFYsLY4sG9myA6An2E7OwDOnB1r9q1MMfv%2Fkh2sRR0oISbtYMPby%2FZ9vjHsd77Ums%2FgfoPdaOAMy%2F3%2BO8hxIK61SzDRteyAU3T%2FMCB3OyEMEL3W678A5q3yWMoRAAA%3D'
    )
    # Pipeline: resolve the m3u8 address, download it, then report the
    # attributes of the downloaded file. Each step is skipped when the
    # previous one returned nothing.
    video_m3u8_url = parse_blob(url)
    if video_m3u8_url:
        video_path = download_video(video_m3u8_url)
        if video_path:
            get_video_attribute_value(video_path)
result:
fps: 24
width: 1280
height: 720
frame_counter: 4666
duration: 194.41666666666666
- JavaScript 教程
- JavaScript 编辑工具
- JavaScript 与HTML
- JavaScript 与Java
- JavaScript 数据结构
- JavaScript 基本数据类型
- JavaScript 特殊数据类型
- JavaScript 运算符
- JavaScript typeof 运算符
- JavaScript 表达式
- JavaScript 类型转换
- JavaScript 基本语法
- JavaScript 注释
- Javascript 基本处理流程
- Javascript 选择结构
- Javascript if 语句
- Javascript if 语句的嵌套
- Javascript switch 语句
- Javascript 循环结构
- Javascript 循环结构实例
- Javascript 跳转语句
- Javascript 控制语句总结
- Javascript 函数介绍
- Javascript 函数的定义
- Javascript 函数调用
- Javascript 几种特殊的函数
- JavaScript 内置函数简介
- Javascript eval() 函数
- Javascript isFinite() 函数
- Javascript isNaN() 函数
- parseInt() 与 parseFloat()
- escape() 与 unescape()
- Javascript 字符串介绍
- Javascript length属性
- javascript 字符串函数
- Javascript 日期对象简介
- Javascript 日期对象用途
- Date 对象属性和方法
- Javascript 数组是什么
- Javascript 创建数组
- Javascript 数组赋值与取值
- Javascript 数组属性和方法
- 常用功能加载宏——一个工作簿的工作表另存为工作簿
- VBA使用API_03:创建窗体
- Jackson 反序列化远程代码执行漏洞复现
- MyVBA加载宏——添加自定义菜单01
- JavaScript|计算字符串的字节数
- 7个处理JavaScript值为undefined的技巧
- Python|快速排序
- 如何判断一个网页是列表页还是详情页
- 小游戏:围住神经猫
- JAVA|Java方法的使用
- VBA解压缩ZIP文件06——Huffman树码表
- 精品连载丨安卓 App 逆向课程之五 frida 注入 Okhttp 抓包下篇
- Excel VBA常用功能加载宏——工作表隐藏
- VBE菜单——CommandBars对象
- Java|屏幕截图