From c4eb8b556818a950233ef16117aded714056bb82 Mon Sep 17 00:00:00 2001
From: Siarhei Siniak
Date: Wed, 23 Jul 2025 11:10:13 +0300
Subject: [PATCH] [+] partially add worker based on huggingface

1. test huggingface manually from IPython;
1.1. seems to work;
1.2. put some long text from documentation, it has provided some summary;
1.3. runs in about 40-60 seconds on 3-4 cores CPU;
1.4. almost made it reuse the local cache; for some reason huggingface can still download LFS weights from the public repo;
2. partially wrapped into a worker which is to be run as a separate service in docker-compose;
---
 .../transform/worker.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py

diff --git a/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py b/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
new file mode 100644
index 0000000..821e222
--- /dev/null
+++ b/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
@@ -0,0 +1,47 @@
+import transformers
+import transformers.pipelines
+
+from typing import (Any, cast, Callable, Protocol, Literal,)
+# Structural type of the object returned by transformers' pipeline() helper;
+# ``self`` was missing from the original signature, so nothing could match it.
+class SummarizerPipeline(Protocol):
+    def predict(self, data: str) -> str: ...
+# Typed view of transformers.pipelines.pipeline as called below: the task is
+# 'summarization' (the original Literal['summarizer'] never matched the call
+# site), and it returns a pipeline object, not the not-yet-defined Summarizer
+# class (which was a NameError at class-definition time).
+class Pipeline(Protocol):
+    def __call__(
+        self,
+        task: Literal['summarization'],
+        model: Any,
+        tokenizer: Any,
+    ) -> SummarizerPipeline: ...
+
+class Summarizer:
+    """Summarization worker wrapping HuggingFace distilbart-cnn-12-6."""
+
+    def __init__(self) -> None:
+        # The original assigned AutoTokenizer to ``self.model`` and the model
+        # to ``self.tokenizer`` (swapped), fixed here.
+        self.tokenizer = getattr(transformers.AutoTokenizer, 'from_pretrained')(
+            'sshleifer/distilbart-cnn-12-6',
+        )
+        self.model = getattr(transformers.AutoModelForSeq2SeqLM, 'from_pretrained')(
+            'sshleifer/distilbart-cnn-12-6',
+        )
+        # ``model=model, tokenizer=tokenizer`` referenced undefined locals
+        # (NameError); cast to SummarizerPipeline, whose .predict is used below.
+        self.summarizer = cast(
+            SummarizerPipeline,
+            getattr(transformers.pipelines, 'pipeline')(
+                'summarization',
+                model=self.model,
+                tokenizer=self.tokenizer,
+            )
+        )
+
+    def summarize(
+        self,
+        data: list[str],
+    ) -> list[str]:
+        # Join the chunks, summarize once, split the summary back into
+        # whitespace tokens (assumes predict() returns str — TODO confirm).
+        return self.summarizer.predict(
+            ' '.join(data)
+        ).split()