From c4eb8b556818a950233ef16117aded714056bb82 Mon Sep 17 00:00:00 2001
From: Siarhei Siniak
Date: Wed, 23 Jul 2025 11:10:13 +0300
Subject: [PATCH] [+] partially add worker based on huggingface

1. test huggingface manually from IPython;
1.1. seems to work;
1.2. put some long text from documentation, it has provided some summary;
1.3. runs in about 40-60 seconds on 3-4 cores CPU;
1.4. almost made it reuse the local cache; for some reason huggingface can still download LFS weights from the public repo;
2. partially wrapped into a worker which is to be run as a separate service in docker-compose;
---
 .../transform/worker.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py

diff --git a/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py b/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
new file mode 100644
index 0000000..821e222
--- /dev/null
+++ b/deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
@@ -0,0 +1,47 @@
+import transformers
+import transformers.pipelines
+
+from typing import (Any, cast, Callable, Protocol, Literal,)
+# Structural type of the object returned by transformers' pipeline() helper;
+# ``self`` was missing from the original signature, so nothing could match it.
+class SummarizerPipeline(Protocol):
+    def predict(self, data: str) -> str: ...
+# Typed view of transformers.pipelines.pipeline as called below: the task is
+# 'summarization' (the original Literal['summarizer'] never matched the call
+# site), and it returns a pipeline object, not the not-yet-defined Summarizer
+# class (which was a NameError at class-definition time).
+class Pipeline(Protocol):
+    def __call__(
+        self,
+        task: Literal['summarization'],
+        model: Any,
+        tokenizer: Any,
+    ) -> SummarizerPipeline: ...
+
+class Summarizer:
+    """Summarization worker wrapping HuggingFace distilbart-cnn-12-6."""
+
+    def __init__(self) -> None:
+        # The original assigned AutoTokenizer to ``self.model`` and the model
+        # to ``self.tokenizer`` (swapped), fixed here.
+        self.tokenizer = getattr(transformers.AutoTokenizer, 'from_pretrained')(
+            'sshleifer/distilbart-cnn-12-6',
+        )
+        self.model = getattr(transformers.AutoModelForSeq2SeqLM, 'from_pretrained')(
+            'sshleifer/distilbart-cnn-12-6',
+        )
+        # ``model=model, tokenizer=tokenizer`` referenced undefined locals
+        # (NameError); cast to SummarizerPipeline, whose .predict is used below.
+        self.summarizer = cast(
+            SummarizerPipeline,
+            getattr(transformers.pipelines, 'pipeline')(
+                'summarization',
+                model=self.model,
+                tokenizer=self.tokenizer,
+            )
+        )
+
+    def summarize(
+        self,
+        data: list[str],
+    ) -> list[str]:
+        # Join the chunks, summarize once, split the summary back into
+        # whitespace tokens (assumes predict() returns str — TODO confirm).
+        return self.summarizer.predict(
+            ' '.join(data)
+        ).split()