import re
import os
import time
import requests
from contextlib import closing
class KeepReptile(object):
    """Crawl a training site for exercise demo videos and download them.

    Starting from ``url``, the crawl follows relative links of the form
    ``/workouthashtags/<id>`` (categories), ``/plans/<id>`` (plans) and
    ``/exercises/<id>`` (exercises). On each exercise page it pairs the
    ``<h2 class="name">`` headings with ``/chaos/<dir>/<file>.mp4`` paths.
    """

    # Raw strings so that ``\w`` reaches the regex engine as the word-character
    # class. Written as a plain ``w*`` the patterns only matched the bare
    # prefix (plus literal "w" characters) and dropped the id that follows.
    _CATEGORY_RE = re.compile(r'/workouthashtags/\w+')
    _PLAN_RE = re.compile(r'/plans/\w+')
    _EXERCISE_RE = re.compile(r'/exercises/\w+')
    # The dot before "mp4" is escaped so it only matches a literal ".".
    _VIDEO_RE = re.compile(r'/chaos/\w+/\w+\.mp4')
    # re.S lets a heading that spans several lines still be captured.
    _ACTION_NAME_RE = re.compile(r'<h2 class="name">(.*?)</h2>', re.S)

    def __init__(self, url, headers, keep_suburl, keep_video_suburl):
        """Store the crawl configuration.

        Args:
            url: Address of the index page the crawl starts from.
            headers: HTTP headers sent with every request.
            keep_suburl: Prefix prepended to relative page links.
            keep_video_suburl: Prefix prepended to relative video paths.
        """
        self.url = url
        self.headers = headers
        self.keep_suburl = keep_suburl
        self.keep_video_suburl = keep_video_suburl

    def _fetch_text(self, page_url):
        """Return the body of ``page_url`` as text, using the stored headers."""
        return requests.get(page_url, headers=self.headers).text

    @staticmethod
    def _unique(links):
        """Return ``links`` without duplicates, keeping first-seen order."""
        seen = set()
        ordered = []
        for link in links:
            if link not in seen:
                seen.add(link)
                ordered.append(link)
        return ordered

    def _page_links(self, pattern, page_url):
        """Fetch ``page_url`` and return the distinct links matching ``pattern``.

        Duplicates are dropped because following the same link twice would
        re-crawl its whole subtree.
        """
        return self._unique(pattern.findall(self._fetch_text(page_url)))

    def _exercise_videos(self, html_text):
        """Return ``(action_name, absolute_video_url)`` pairs found in a page.

        Names and video paths are paired positionally; if the page holds more
        of one than the other, the surplus is ignored (``zip`` semantics).
        """
        video_urls = self._VIDEO_RE.findall(html_text)
        action_names = self._ACTION_NAME_RE.findall(html_text)
        return [
            (action_name, self.keep_video_suburl + video_url)
            for action_name, video_url in zip(action_names, video_urls)
        ]

    def get_download_video(self):
        """Walk index -> category -> plan -> exercise pages and collect videos.

        Returns:
            dict mapping action name to absolute video URL. When the same
            action name is found more than once, the last one seen wins.
        """
        videos = {}
        # Category pages linked from the index.
        for category in self._page_links(self._CATEGORY_RE, self.url):
            # Plan pages linked from each category.
            for plan in self._page_links(self._PLAN_RE, self.keep_suburl + category):
                # Exercise pages linked from each plan.
                for exercise in self._page_links(self._EXERCISE_RE, self.keep_suburl + plan):
                    exercise_html = self._fetch_text(self.keep_suburl + exercise)
                    # Video URLs on the exercise page.
                    for action_name, final_video_url in self._exercise_videos(exercise_html):
                        print(action_name, final_video_url)
                        videos[action_name] = final_video_url
        return videos

    def download_video(self, video_path, action_name, video):
        """Stream one video to ``video_path + action_name + '.mp4'``.

        The download is skipped when a file of at least the advertised
        ``content-length`` already exists. Non-200 responses are reported and
        nothing is written.

        Args:
            video_path: Destination prefix; it is concatenated as-is, so it
                should end with a path separator.
            action_name: Name used for the output file.
            video: Absolute URL of the video.
        """
        video_name = video_path + action_name + '.mp4'
        print('video_name:', video_name)
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept to preserve existing behaviour; confirm it is really required.
        with closing(requests.get(video, headers=self.headers, stream=True, verify=False)) as res:
            # Check the status first: the length of an error body says nothing
            # about the video and must not trigger the "already downloaded" path.
            if res.status_code != 200:
                print('下载失败, 状态码:', res.status_code)
                return
            chunk_size = 1024*10
            try:
                content_size = int(res.headers['content-length'])
            except (KeyError, ValueError):
                # Header missing or malformed: total size is unknown.
                content_size = None
            if (content_size is not None and os.path.exists(video_name)
                    and os.path.getsize(video_name) >= content_size):
                print('已下载')
                return
            print('开始下载')
            with open(video_name, "wb") as f:
                # Progress needs a positive total size; skip reporting otherwise.
                p = None
                if content_size:
                    p = ProgressData(size=content_size, unit='Kb', block=chunk_size, file_name=video_name)
                for chunk in res.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        if p is not None:
                            p.output()
class ProgressData(object):
    """Print console progress for a transfer that arrives in fixed-size blocks.

    Progress is an estimate: each call to ``output`` is counted as one full
    block, so ``loaded`` is ``count * block`` rather than a measured amount.
    """

    def __init__(self, block,size, unit, file_name='', ):
        """Set up the counters.

        Args:
            block: Size of one block; stored divided by 1000 for display.
            size: Total size, in the same unit as ``block``; stored divided
                by 1000 for display.
            unit: Label printed after the scaled values (e.g. ``'Kb'``).
            file_name: Name shown in the progress lines.
        """
        self.file_name = file_name
        self.block = block/1000.0
        self.size = size/1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more block and print either progress or completion."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Guard against a zero (or negative, if the clock moved) interval.
        speed = self.block/elapsed if elapsed > 0 else 0
        # Restart the timer so the next speed covers only the next block.
        self.start = time.time()
        loaded = self.count*self.block
        if loaded >= self.size:
            print('%s下载完成 '%self.file_name)
            return
        # The ratio is computed only on this branch: here loaded < size, so
        # size is strictly positive and a zero total cannot divide by zero.
        progress = round(loaded/self.size, 4)
        print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(self.file_name, loaded, self.unit, self.size, self.unit, progress, speed, self.unit))
        # Right-aligned bar of slashes whose length shrinks as progress grows.
        print('%50s'%('/'*int((1-progress)*50)))
def main():
    """Crawl the training index and download every video that was found.

    The site addresses and the output directory are hard-coded below.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    url = 'https://www.gotokeep.com/training'
    keep_suburl = 'https://www.gotokeep.com'
    keep_video_suburl = 'https://static1.keepcdn.com'
    video_path = 'G:/zh/zh/reptile_video/keep_video/'
    # Create the output directory up front: opening a file for writing does
    # not create missing parent directories, so the first download would fail.
    os.makedirs(video_path, exist_ok=True)
    k = KeepReptile(url, headers, keep_suburl, keep_video_suburl)
    video_dict = k.get_download_video()
    for action_name, video in video_dict.items():
        k.download_video(video_path, action_name, video)
if __name__ == "__main__":