[+] partially add worker based on huggingface
1. tested huggingface manually from IPython; 1.1. seems to work; 1.2. fed in some long text from the documentation and it produced a summary; 1.3. runs in about 40-60 seconds on 3-4 CPU cores; 1.4. almost got it to reuse the local cache — for some reason huggingface can still download LFS weights from the public repo; 2. partially wrapped it into a worker that is to be run as a separate service in docker-compose;
This commit is contained in:
parent
0b5971c4af
commit
c4eb8b5568
47
deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
vendored
Normal file
47
deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
import transformers
|
||||
import transformers.pipelines
|
||||
|
||||
from typing import (Any, cast, Callable, Protocol, Literal,)
|
||||
|
||||
class SummarizerPipeline(Protocol):
    """Structural type for the object returned by the HuggingFace
    ``pipeline('summarization', ...)`` factory.

    Only the single method this project uses is declared.
    """

    # Original declared ``predict(data: str)`` without ``self``, so the
    # protocol could never match a real instance method — fixed here.
    def predict(self, data: str) -> str: ...
|
||||
|
||||
class Pipeline(Protocol):
    """Structural type for the ``transformers.pipelines.pipeline`` factory.

    Calling it with a task name, a model and a tokenizer yields the
    task-specific pipeline object.
    """

    def __call__(
        self,
        # The only call site passes 'summarization' (the real HF task name);
        # the original Literal said 'summarizer', which would never type-check.
        task: Literal['summarization'],
        model: Any,
        tokenizer: Any,
        # String annotation: avoids evaluating a name at class-body time, and
        # the factory returns the pipeline object (the thing with .predict),
        # not the enclosing Summarizer wrapper as the original claimed.
    ) -> 'SummarizerPipeline': ...
|
||||
|
||||
class Summarizer:
    """Thin wrapper around a HuggingFace summarization pipeline.

    Loads the ``sshleifer/distilbart-cnn-12-6`` checkpoint (tokenizer +
    seq2seq model) and exposes a whitespace-token oriented ``summarize``.
    NOTE(review): measured at ~40-60s per call on 3-4 CPU cores per the
    commit message — treat as an offline/worker-only component.
    """

    #: HF hub id of the summarization checkpoint, hoisted from the two
    #: duplicated string literals in the original.
    MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'

    def __init__(self) -> None:
        # BUG FIX: the original swapped the two assignments —
        # ``self.model`` received the tokenizer and vice versa.
        self.tokenizer = cast(
            Callable[[str], Any],
            getattr(transformers.AutoTokenizer, 'from_pretrained')(
                self.MODEL_NAME,
            ),
        )
        self.model = cast(
            Callable[[str], Any],
            getattr(transformers.AutoModelForSeq2SeqLM, 'from_pretrained')(
                self.MODEL_NAME,
            ),
        )

        # BUG FIX: the original passed bare ``model`` / ``tokenizer``
        # (NameError — those locals do not exist); the attributes set
        # above are what must be forwarded. The result is the pipeline
        # object itself, so it is cast to SummarizerPipeline (the protocol
        # with ``.predict``), not to the Pipeline factory protocol.
        self.summarizer = cast(
            SummarizerPipeline,
            getattr(transformers.pipelines, 'pipeline')(
                'summarization',
                model=self.model,
                tokenizer=self.tokenizer,
            ),
        )

    def summarize(
        self,
        data: list[str],
    ) -> list[str]:
        """Summarize ``data`` and return the summary as whitespace tokens.

        The input chunks are joined with single spaces, fed through the
        pipeline, and the resulting text is split back into tokens.
        Empty input yields an empty list.
        """
        return self.summarizer.predict(
            ' '.join(data),
        ).split()
|
Loading…
Reference in New Issue
Block a user