[+] partially add worker based on huggingface

1. test huggingface manually from IPython;
  1.1. seems to work;
  1.2 put some long text from documentation,
    it has provided some summary;
  1.3. runs in about 40-60 seconds on 3-4 cores CPU;
  1.4. almost made it reuse the local cache;
    for some reason huggingface can still
    download LFS weights from the public repo;
  2. partially wrapped into worker
    which is to be run as a separate service in docker-compose;
This commit is contained in:
Siarhei Siniak 2025-07-23 11:10:13 +03:00
parent 0b5971c4af
commit c4eb8b5568

@ -0,0 +1,47 @@
import transformers
import transformers.pipelines
from typing import (Any, cast, Callable, Protocol, Literal,)
class SummarizerPipeline(Protocol):
    """Structural type for the object returned by ``transformers.pipeline``.

    Fix: the original ``predict`` signature lacked ``self``, so no real
    pipeline instance (whose bound ``predict`` takes one positional argument)
    would structurally match this protocol.
    """

    def predict(self, data: str) -> str:
        """Summarize *data* and return the summary text.

        NOTE(review): transformers pipelines usually return a list of dicts;
        this protocol asserts a plain ``str`` — confirm against actual usage.
        """
        ...
class Pipeline(Protocol):
    """Structural type for the ``transformers.pipelines.pipeline`` factory.

    Fixes over the original:
    - the return annotation referenced ``Summarizer``, a class defined later
      in the file, which raises ``NameError`` when this class body is
      evaluated at import time; the factory also returns a pipeline object,
      not the ``Summarizer`` wrapper — it is now a quoted forward reference
      to ``SummarizerPipeline``;
    - the ``task`` literal was ``'summarizer'`` while the call site actually
      passes ``'summarization'``; the literal now matches the call site.
    """

    def __call__(
        self,
        task: Literal['summarization'],
        model: Any,
        tokenizer: Any,
    ) -> 'SummarizerPipeline':
        """Build and return a summarization pipeline."""
        ...
class Summarizer:
    """Thin wrapper around a HuggingFace ``distilbart-cnn-12-6`` summarizer.

    Fixes over the original:
    - ``self.model`` was assigned the *tokenizer* and ``self.tokenizer`` the
      *model* (``AutoTokenizer`` / ``AutoModelForSeq2SeqLM`` were swapped);
    - the ``pipeline(...)`` call passed bare ``model`` / ``tokenizer`` names
      that were never bound in ``__init__`` (``NameError``) — they must be
      the instance attributes;
    - the pipeline result was cast to ``Pipeline`` (the factory protocol)
      although it is subsequently used via ``.predict`` — it is now cast to
      ``SummarizerPipeline``.
    """

    def __init__(self) -> None:
        # Model weights: seq2seq model fine-tuned for CNN-style summarization.
        self.model = cast(
            Callable[[str], Any],
            getattr(transformers.AutoModelForSeq2SeqLM, 'from_pretrained')(
                'sshleifer/distilbart-cnn-12-6',
            )
        )
        # Matching tokenizer for the same checkpoint.
        self.tokenizer = cast(
            Callable[[str], Any],
            getattr(transformers.AutoTokenizer, 'from_pretrained')(
                'sshleifer/distilbart-cnn-12-6',
            )
        )
        # Assemble the ready-to-use summarization pipeline from the two
        # components loaded above.
        self.summarizer = cast(
            SummarizerPipeline,
            getattr(transformers.pipelines, 'pipeline')(
                'summarization',
                model=self.model,
                tokenizer=self.tokenizer,
            )
        )

    def summarize(
        self,
        data: list[str]
    ) -> list[str]:
        """Summarize the concatenation of *data* and return it word-split.

        NOTE(review): joining the chunks with spaces and re-splitting the
        summary loses the original chunk boundaries — confirm callers expect
        a flat list of words rather than one summary per input chunk.
        """
        return self.summarizer.predict(
            ' '.join(data)
        ).split()