Python selenium chrome parses blob url

时间:2022-07-25
本文章向大家介绍Python selenium chrome parses blob url,主要内容包括其使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

When crawling video sites, you will find that many videos use a blob URL to hide the source address. With F12 (the browser's developer tools) you can find the m3u8 download address in the corresponding network request, but when the amount of data to crawl is very large, analyzing each page by hand with F12 is obviously impractical. This article parses the blob URL through Selenium and Chrome to obtain the source m3u8 address automatically.

Example in this article: twitter preview link.

url : https://ton.twimg.com/ads-manager/tweet-preview/index.html?data=H4sIAAAAAAAAAO1YbW%2FbNhD%2BK4T2caotWbIkG9iHNE3TdV3Wpi3QFxUCLVE2U4nUSMqJE%2Fi%2F747yixI76AYUxQYsQGLzeHc8Hu8ePsydY64ZM86U3Dm5YtSwIqMwdJ4rTi7kkoxC4o2mgTf1RuRnD37IyPNjx3V44UwnQZAk42DiJ9E4jsZJMkF5po0CD4eTYFW2VZUZdoNLnFWEpq3nlb4kDdW0kC6pUcB8TQpGJrBYzSsC2oqKK6akdkmj5JIJzoRhVikOwNYaFZrBPKjO6BX8CsIE%2Bel0wQV1SSsoyXmpKBEtWzKyZDmY13QlFfmzZWRBc0YCbxePHpCFMY2epsN0aAa5TIfq1%2BfFb6eX6o8AtmFUK3JMljMtaaWZ6xRcNxVd2b1lEO6cOdPPnusH8RfXgWi54Uw70ztnQfXC0Dl8%2F3znbDJho8ScioLnqPY5Cd2J92UNtnpVz2SFMhi0mqmsRndSbEVqO1mzglPrdnc23siPx%2BMkDIJ4Mnp4Nvcm763uB4nrR%2F7WZwZrgBFmxCakmemBueb1HBJTp0Orkw6fvf5wmbHo%2FdnJSfDk03hw1cydnoPMJnTjRv9jP%2FsQHj2U7RF0qg3P0bkxTHXu7%2Bmym4aKAor9wG%2FfJJ8bwQTToCurdKgNNa1Oh4eVnQ6XvGAQkI%2FFsWoYBrCQRsJQ89vu5M2irWf45dqZ%2BmPPdRabT8VQB0xyJRtn7ToVVVg%2FVjMKO80g6muW3KAipqyt%2F46mrmlVfVsRNEtAgVbZkNfrL2vMlWE2V%2F0y%2Fr%2FY%2FlXFZiX%2FwWKzcWdclBLVqW5YjvAJ%2BIaVEbkTKIuitQKRwWVQcdidPwkj3wdjqjgVpoPSGQfoN%2BA7CUZwTblOLuGOECbrZwjOvQmPHLCd7JcHrZuKl6tsY3VYwtYmHcLubmBn6fBlefGSX52%2FvXwSv5BPl8EAF1q7vbjAOvpxgfmjxLuJRxDZs2V76S0L%2FeaqZefmen4YmU3YjwoMFrvxE4jrbEbP9TP%2FqfyQxRfvrukurgdh0AYc57YC0uHNk7ph8%2FeXr75TSA102ou4oUvvZHX29fLs9uPzZlAHbeJY6Otj4RZevrIVLOsH2VFco0XBMVRaZZ36trYBOivcDmIH07niDap1AlbPWFHQGSoAtQA6UUvBDL%2FtRJZidFisZatydINEJcgpWShW%2FpL2sqBNW3B5D40colgFSkKWsqrkddpZs3edDvkdAyVvrWHnF3LXqVikzhRrKqA2MutAKUPIF8DkHpnscP9QwbKX47abqccsIV%2BMiUzQmm2n0QLziv7iyA%2BjiTcO%2FAQIlxdEfR56OAmb6jw5p%2BfvLpAnnulmS%2F0qRNH%2Bcs49YIbZSnbVaE8B3MZpGzKauwQ%2FRwXKQobfZ%2BAzyn0fJH6Mf8Mkh79RWRyp3o3%2FAa6Gp3ZQJ29lLTWZq7aRSHyxumBct5XhFRPzll4xsurGcDMZWkpV0wF5BUS3gRKGjqLooKKaCGl4zqkl0JZ7gmHBKjAWhbSEGJ4CFfJMRnjdSGWoGEBAQL4NAHSP9nYFxZTOctkK4LKRF46TGCYUcPRiJ%2FZDkAF44%2FtiI4K2cQ%2FfHCftnHhj4sXTcDL1wv2bI8LXA11C%2FXOg%2FlsnQTAJIZcmz2RZanzIdNVheM2yW2iirWAO1wwT2E%2FFtseWTPGS78dd%2FfZ8J%2BMoxPtRzLdeEJoUn7VGwo537ja54DrDh4qG5Et1TIp32EMjSGkJ55fNaP51rmBhzE%2BF9o5nf5yjOrymc%2FaQKtH7FMfqAIMwC1bvPv10OJsPGjH%2Fht8jDOo7uMdSPdj60b08oGv3dDVeb2EURsk4nkSjUWgTlQ
5Hrz%2BJN1fqYyaw9KsNmTtY5tvk8DusBg9QAZD2sMsfWajThpUOwQpWHycR1P8oDHv%2BKy6%2B7iplFITx5LQ3q%2BHGm1GVzaQqIIpHK2qrB4PqcS37pn10FlrmoIL2z2JWUkCkbKP8mPyBVYcqfN93nQDuA8AmbTIEs%2B0UYlm54Qd61%2F67PtyyCIFYsLY4sG9myA6An2E7OwDOnB1r9q1MMfv%2Fkh2sRR0oISbtYMPby%2FZ9vjHsd77Ums%2FgfoPdaOAMy%2F3%2BO8hxIK61SzDRteyAU3T%2FMCB3OyEMEL3W678A5q3yWMoRAAA%3D

Selenium records all network requests through Chrome's performance log, which is equivalent to the "All" filter of the Network tab in F12.

Parse the request log to get the m3u8 address, download the video with ffmpy3, and finally open the downloaded video with OpenCV to read its attributes (fps, width, height, frame count and duration).

pip install selenium ffmpy3 opencv-python

code:

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
import re
from selenium.webdriver.chrome.options import Options
from ffmpy3 import FFmpeg
import cv2


def download_video(video_m3u8_url, outputs_path=r'/data/test.mp4'):
    """Download an m3u8 stream to a local file by running ffmpeg via ffmpy3.

    Args:
        video_m3u8_url: URL of the m3u8 playlist, handed to ffmpeg as its
            only input.
        outputs_path: Destination file path. Defaults to ``/data/test.mp4``,
            the location this function previously always wrote to.

    Returns:
        ``outputs_path`` when the ffmpeg run finishes without raising,
        otherwise ``None``. Failures are printed, not propagated, so callers
        only need to test the return value for truthiness.
    """
    try:
        # No per-file ffmpeg options are supplied for the input or the output.
        converter = FFmpeg(inputs={video_m3u8_url: None}, outputs={outputs_path: None})
        converter.run()
    except Exception as e:
        # Best-effort download: report the problem and signal failure with None.
        print(e)
        return None
    return outputs_path


def _find_m3u8_url(performance_log):
    """Return the first ``.m3u8`` request URL found in a performance log, or None."""
    for entry in performance_log:
        # Each entry carries a JSON document in its 'message' field. It is
        # parsed with json (never eval): JSON literals such as true/false/null
        # are not Python names, and the text is influenced by the visited page.
        event = json.loads(entry.get('message')).get('message') or {}
        if event.get('method') != 'Network.requestWillBeSentExtraInfo':
            continue
        headers = (event.get('params') or {}).get('headers') or {}
        path = headers.get(':path')
        # Plain substring test: a literal ".m3u8" must appear in the path.
        if not path or '.m3u8' not in path:
            continue
        scheme = headers.get(':scheme')
        authority = headers.get(':authority')
        # Skip requests that lack the parts needed to rebuild an absolute URL.
        if scheme and authority:
            return scheme + '://' + authority + path
    return None


def parse_blob(url):
    """Load ``url`` in Chrome and recover the m3u8 address behind its blob video.

    Chrome is started with performance logging enabled, the page is opened,
    and the recorded ``Network.requestWillBeSentExtraInfo`` events are scanned
    for a request whose ``:path`` header contains ``.m3u8``.

    Args:
        url: Address of the page to open.

    Returns:
        The rebuilt ``scheme://authority/path`` URL of the first matching
        request, or ``None`` when nothing matches or an error occurs while
        driving the browser (the error is printed, not propagated).
    """
    chrome_options = Options()
    prefs = {
        "profile.managed_default_content_settings.images": 1,
        "profile.content_settings.plugin_whitelist.adobe-flash-player": 1,
        "profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player": 1,
    }
    chrome_options.add_experimental_option('prefs', prefs)
    chrome_options.add_experimental_option('w3c', False)
    # Work on a copy so the shared DesiredCapabilities.CHROME dict is not
    # modified for every other driver created in this process.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['goog:loggingPrefs'] = {'performance': 'ALL'}
    # NOTE(review): the desired_capabilities/chrome_options keywords are kept
    # unchanged from the original call; confirm they are accepted by the
    # installed Selenium release.
    browser = webdriver.Chrome(desired_capabilities=capabilities, chrome_options=chrome_options)
    try:
        browser.get(url)
        # NOTE(review): presumably meant to give the player time to request
        # the playlist; confirm implicitly_wait actually delays the log read
        # below, or whether an explicit wait is needed.
        browser.implicitly_wait(30)
        return _find_m3u8_url(browser.get_log('performance'))
    except Exception as e:
        print(e)
        return None
    finally:
        # Always shut the browser down, including on the early-return path.
        browser.quit()


def get_video_attribute_value(video_path):
    """Print the fps, frame size, frame count and duration of a video file.

    Args:
        video_path: Path of the video file to inspect with OpenCV.

    Returns:
        None. The attributes are printed. A file that cannot be opened, or
        any other error, is reported with a printed message rather than an
        exception.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Without this check every property below would read as 0.
            print('cannot open video:', video_path)
            return
        raw_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(round(raw_fps))
        print('fps:', fps)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        print('width:', width)
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print('height:', height)
        frame_counter = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print('frame_counter:', frame_counter)
        if raw_fps > 0:
            # Divide by the unrounded rate so fractional frame rates
            # (e.g. 29.97) do not skew the duration.
            duration = frame_counter / raw_fps
            print('duration:', duration)
        else:
            print('duration: unknown (no frame rate reported)')
    except Exception as e:
        print(e)
    finally:
        # Release the capture even when a property read raised.
        if cap is not None:
            cap.release()


if __name__ == '__main__':
    # Example page: a tweet-preview link hosted on ton.twimg.com.
    # The URL is written as two adjacent string literals, which Python joins
    # into one string; a single-quoted literal cannot span physical lines.
    url = (
        'https://ton.twimg.com/ads-manager/tweet-preview/index.html?data=H4sIAAAAAAAAAO1YbW%2FbNhD%2BK4T2caotWbIkG9iHNE3TdV3Wpi3QFxUCLVE2U4nUSMqJE%2Fi%2F747yixI76AYUxQYsQGLzeHc8Hu8ePsydY64ZM86U3Dm5YtSwIqMwdJ4rTi7kkoxC4o2mgTf1RuRnD37IyPNjx3V44UwnQZAk42DiJ9E4jsZJMkF5po0CD4eTYFW2VZUZdoNLnFWEpq3nlb4kDdW0kC6pUcB8TQpGJrBYzSsC2oqKK6akdkmj5JIJzoRhVikOwNYaFZrBPKjO6BX8CsIE%2Bel0wQV1SSsoyXmpKBEtWzKyZDmY13QlFfmzZWRBc0YCbxePHpCFMY2epsN0aAa5TIfq1%2BfFb6eX6o8AtmFUK3JMljMtaaWZ6xRcNxVd2b1lEO6cOdPPnusH8RfXgWi54Uw70ztnQfXC0Dl8%2F3znbDJho8ScioLnqPY5Cd2J92UNtnpVz2SFMhi0mqmsRndSbEVqO1mzglPrdnc23siPx%2BMkDIJ4Mnp4Nvcm763uB4nrR%2F7WZwZrgBFmxCakmemBueb1HBJTp0Orkw6fvf5wmbHo%2FdnJSfDk03hw1cydnoPMJnTjRv9jP%2FsQHj2U7RF0qg3P0bkxTHXu7%2Bmym4aKAor9wG%2FfJJ8bwQTToCurdKgNNa1Oh4eVnQ6XvGAQkI%2FFsWoYBrCQRsJQ89vu5M2irWf45dqZ%2BmPPdRabT8VQB0xyJRtn7ToVVVg%2FVjMKO80g6muW3KAipqyt%2F46mrmlVfVsRNEtAgVbZkNfrL2vMlWE2V%2F0y%2Fr%2FY%2FlXFZiX%2FwWKzcWdclBLVqW5YjvAJ%2BIaVEbkTKIuitQKRwWVQcdidPwkj3wdjqjgVpoPSGQfoN%2BA7CUZwTblOLuGOECbrZwjOvQmPHLCd7JcHrZuKl6tsY3VYwtYmHcLubmBn6fBlefGSX52%2FvXwSv5BPl8EAF1q7vbjAOvpxgfmjxLuJRxDZs2V76S0L%2FeaqZefmen4YmU3YjwoMFrvxE4jrbEbP9TP%2FqfyQxRfvrukurgdh0AYc57YC0uHNk7ph8%2FeXr75TSA102ou4oUvvZHX29fLs9uPzZlAHbeJY6Otj4RZevrIVLOsH2VFco0XBMVRaZZ36trYBOivcDmIH07niDap1AlbPWFHQGSoAtQA6UUvBDL%2FtRJZidFisZatydINEJcgpWShW%2FpL2sqBNW3B5D40colgFSkKWsqrkddpZs3edDvkdAyVvrWHnF3LXqVikzhRrKqA2MutAKUPIF8DkHpnscP9QwbKX47abqccsIV%2BMiUzQmm2n0QLziv7iyA%2BjiTcO%2FAQIlxdEfR56OAmb6jw5p%2BfvLpAnnulmS%2F0qRNH%2Bcs49YIbZSnbVaE8B3MZpGzKauwQ%2FRwXKQobfZ%2BAzyn0fJH6Mf8Mkh79RWRyp3o3%2FAa6Gp3ZQJ29lLTWZq7aRSHyxumBct5XhFRPzll4xsurGcDMZWkpV0wF5BUS3gRKGjqLooKKaCGl4zqkl0JZ7gmHBKjAWhbSEGJ4CFfJMRnjdSGWoGEBAQL4NAHSP9nYFxZTOctkK4LKRF46TGCYUcPRiJ%2FZDkAF44%2FtiI4K2cQ%2FfHCftnHhj4sXTcDL1wv2bI8LXA11C%2FXOg%2FlsnQTAJIZcmz2RZanzIdNVheM2yW2iirWAO1wwT2E%2FFtseWTPGS78dd%2FfZ8J%2BMoxPtRzLdeEJoUn7VGwo537ja54DrDh4qG5Et1TIp32EMjSGkJ55fNaP51rmBhzE%2BF9o5nf5yjOrymc%2FaQKtH7FMfqAIMwC1bvPv10OJsPGjH%2Fht8jDOo7uMdSPdj60b08oGv3dDVeb2EURsk4nkSjU'
        'WgTlQ5Hrz%2BJN1fqYyaw9KsNmTtY5tvk8DusBg9QAZD2sMsfWajThpUOwQpWHycR1P8oDHv%2BKy6%2B7iplFITx5LQ3q%2BHGm1GVzaQqIIpHK2qrB4PqcS37pn10FlrmoIL2z2JWUkCkbKP8mPyBVYcqfN93nQDuA8AmbTIEs%2B0UYlm54Qd61%2F67PtyyCIFYsLY4sG9myA6An2E7OwDOnB1r9q1MMfv%2Fkh2sRR0oISbtYMPby%2FZ9vjHsd77Ums%2FgfoPdaOAMy%2F3%2BO8hxIK61SzDRteyAU3T%2FMCB3OyEMEL3W678A5q3yWMoRAAA%3D'
    )
    # Each stage returns a falsy value on failure, so later stages are skipped.
    video_m3u8_url = parse_blob(url)
    if video_m3u8_url:
        video_path = download_video(video_m3u8_url)
        if video_path:
            get_video_attribute_value(video_path)

result:

fps: 24

width: 1280

height: 720

frame_counter: 4666

duration: 194.41666666666666