# NOTE(review): this span was a whitespace-collapsed git patch against
# python/tasks/tiktok/__init__.py.  Only the definitions the patch shows in
# full are reproduced below.  Two further hunks touch functions that are only
# partially visible and are therefore not reconstructed here:
#   * tiktok_config(): project_root is now joined from
#     os.path.dirname(__file__) instead of __file__.
#   * tiktok_videos_meta(): every entry gains
#     result_dir=tiktok_config().videos.
# Names such as logger, json, Optional, Iterable and Any are expected to come
# from the part of the module that the patch does not show.
import logging
import enum
import dataclasses
import multiprocessing
import traceback
import subprocess
import os


class tiktok_video_fetch_t:
    """Namespace for the types used by tiktok_video_fetch()."""

    class method_t(enum.Enum):
        """Download back-ends; each value is the accepted ``method_str``."""

        pyktok = 'pyktok'
        tikcdn_io_curl = 'tikcdn.io-curl'
        tikcdn_io_wget = 'tikcdn.io-wget'


def tiktok_video_fetch(
    id: int,
    url: str,
    fname: str,
    result_dir: str,
    method: Optional[tiktok_video_fetch_t.method_t]=None,
    method_str: Optional[str]=None,
) -> None:
    """Download one video into ``result_dir`` as ``fname``.

    Args:
        id: numeric video id, used to build the tikcdn.io URL.
        url: original video URL, used by the pyktok back-end.
        fname: file name (relative to ``result_dir``) the download is
            expected to produce.
        result_dir: directory the download is written into.
        method: back-end to use; defaults to ``tikcdn_io_curl``.
        method_str: string form of ``method``; when given it takes
            precedence over ``method``.

    Raises:
        ValueError: ``method_str`` is not a valid ``method_t`` value.
        NotImplementedError: ``method`` is not a known back-end.
        subprocess.CalledProcessError: curl / wget exited with an error.
        RuntimeError: the download produced no file or an empty one.
    """
    # Resolve the back-end before touching process state, so a bad argument
    # fails without side effects.
    if method_str is not None:
        method = tiktok_video_fetch_t.method_t(method_str)

    if method is None:
        method = tiktok_video_fetch_t.method_t.tikcdn_io_curl

    # The back-ends write relative to the current directory.  Restore the
    # previous one afterwards so neither a direct caller nor a reused pool
    # worker is left inside result_dir.
    previous_cwd = os.getcwd()
    os.chdir(result_dir)
    try:
        if method == tiktok_video_fetch_t.method_t.pyktok:
            # Imported here: the name is not otherwise in scope in this
            # function, and the other back-ends must work without pyktok.
            import pyktok

            pyktok.save_tiktok(url)
        elif method == tiktok_video_fetch_t.method_t.tikcdn_io_curl:
            subprocess.check_call([
                'curl',
                '-v',
                'https://tikcdn.io/ssstik/%d' % id,
                '-o', fname,
            ])
        elif method == tiktok_video_fetch_t.method_t.tikcdn_io_wget:
            subprocess.check_call([
                'wget',
                'https://tikcdn.io/ssstik/%d' % id,
                '-O',
                fname,
            ])
        else:
            raise NotImplementedError

        mime_type = file_mime_type(fname)

        # A missing file is as much a failed download as an empty one.
        if mime_type is None or mime_type in ['empty']:
            raise RuntimeError('notdownloaded')
    finally:
        os.chdir(previous_cwd)


def file_mime_type(path: str) -> Optional[str]:
    """Return what ``file -b`` prints for ``path``, or None if it is missing.

    Despite the name, this is the textual description produced by ``file -b``
    (for example ``empty`` for a zero-length file), not a MIME type.
    """
    if not os.path.exists(path):
        return None

    return subprocess.check_output([
        'file',
        '-b', path,
    ]).strip().decode('utf-8')


async def playwright_save(url: str):
    """Navigate a TikTokApi browser page to ``url`` and print the download path.

    NOTE(review): relies on TikTokApi exposing ``sessions[0].page`` as a
    Playwright page -- confirm against the installed TikTokApi version.
    """
    import TikTokApi

    async with TikTokApi.TikTokApi() as client:
        await client.create_sessions()
        session = client.sessions[0]
        page = session.page

        async with page.expect_download() as download_info:
            await page.goto(url)

        # In the async Playwright API these are awaitables; without the
        # awaits the code operated on a pending object and failed.
        download = await download_info.value
        path = await download.path()
        # NOTE(review): kept from the original -- this saves the download
        # onto its own temporary path; presumably a real destination was
        # intended.
        await download.save_as(path)
        print(path)


def tiktok_videos_fetch(
    meta: Iterable[dict[str, Any]],
    method: Optional[tiktok_video_fetch_t.method_t]=None,
    method_str: Optional[str]=None,
    force: Optional[bool]=None,
) -> dict[str, int]:
    """Fetch every video described in ``meta`` that is not on disk yet.

    Each entry of ``meta`` must provide ``id``, ``url``, ``fname`` and
    ``result_dir``.  A video is fetched when ``force`` is true, when its file
    is missing, or when ``file -b`` reports it as ``empty``.  Downloads run
    one at a time in a single worker process.

    Args:
        meta: video descriptors to process.
        method: back-end forwarded to tiktok_video_fetch().
        method_str: string form of ``method``, forwarded as well.
        force: re-download files that already exist; None means False.

    Returns:
        Counters with the keys ``saved``, ``total``, ``skipped`` and
        ``error``.  Ctrl-C stops the loop and returns the counters so far.
    """
    import tqdm

    if force is None:
        force = False

    # NOTE(review): the 'total' and 'skipped' initialisers sit between two
    # hunks of the patch and are not visible; they are reconstructed from the
    # counters incremented below.
    stats = dict(
        saved=0,
        total=0,
        skipped=0,
        error=0,
    )

    with multiprocessing.Pool(processes=1) as pool:
        for o in tqdm.tqdm(meta):
            stats['total'] += 1
            path = os.path.join(
                o['result_dir'],
                o['fname'],
            )

            # `force` is tested first so a forced run never spawns `file`.
            needs_fetch = (
                force or
                not os.path.exists(path) or
                file_mime_type(path) in ['empty']
            )

            if not needs_fetch:
                stats['skipped'] += 1
                continue

            try:
                pool.apply(
                    tiktok_video_fetch,
                    kwds=dict(
                        id=o['id'],
                        url=o['url'],
                        fname=o['fname'],
                        method=method,
                        method_str=method_str,
                        result_dir=o['result_dir'],
                    ),
                )
            except KeyboardInterrupt:
                break
            except Exception:
                # Best effort: log the failure and move on to the next video.
                logger.error(json.dumps(dict(
                    msg=traceback.format_exc(),
                )))
                stats['error'] += 1
            else:
                stats['saved'] += 1

    return stats