[+] partially add worker based on huggingface
1. tested huggingface manually from IPython; 1.1. seems to work; 1.2. fed in some long text from the documentation and it produced a summary; 1.3. runs in about 40-60 seconds on 3-4 CPU cores; 1.4. almost got it to reuse the local cache — for some reason huggingface can still download LFS weights from the public repo; 2. partially wrapped it into a worker that is to be run as a separate service in docker-compose;
This commit is contained in:
parent
0b5971c4af
commit
c4eb8b5568
47
deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
vendored
Normal file
47
deps/test-task-2025-07-17-v2/python/online/fxreader/pr34/test_task_2025_07_17_v2/transform/worker.py
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
import transformers
|
||||
import transformers.pipelines
|
||||
|
||||
from typing import (Any, cast, Callable, Protocol, Literal,)
|
||||
|
||||
class SummarizerPipeline(Protocol):
    """Structural type for the object returned by the HuggingFace
    ``pipeline('summarization', ...)`` factory.

    Only the single method this project uses is declared.
    """

    # Original declared ``predict(data: str)`` without ``self``, so the
    # protocol could never match a real instance method — fixed here.
    def predict(self, data: str) -> str: ...
|
||||
|
||||
class Pipeline(Protocol):
    """Structural type for the ``transformers.pipelines.pipeline`` factory.

    Calling it with a task name, a model and a tokenizer yields the
    task-specific pipeline object.
    """

    def __call__(
        self,
        # The only call site passes 'summarization' (the real HF task name);
        # the original Literal said 'summarizer', which would never type-check.
        task: Literal['summarization'],
        model: Any,
        tokenizer: Any,
        # String annotation: avoids evaluating a name at class-body time, and
        # the factory returns the pipeline object (the thing with .predict),
        # not the enclosing Summarizer wrapper as the original claimed.
    ) -> 'SummarizerPipeline': ...
|
||||
|
||||
class Summarizer:
    """Thin wrapper around a HuggingFace summarization pipeline.

    Loads the ``sshleifer/distilbart-cnn-12-6`` checkpoint (tokenizer +
    seq2seq model) and exposes a whitespace-token oriented ``summarize``.
    NOTE(review): measured at ~40-60s per call on 3-4 CPU cores per the
    commit message — treat as an offline/worker-only component.
    """

    #: HF hub id of the summarization checkpoint, hoisted from the two
    #: duplicated string literals in the original.
    MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'

    def __init__(self) -> None:
        # BUG FIX: the original swapped the two assignments —
        # ``self.model`` received the tokenizer and vice versa.
        self.tokenizer = cast(
            Callable[[str], Any],
            getattr(transformers.AutoTokenizer, 'from_pretrained')(
                self.MODEL_NAME,
            ),
        )
        self.model = cast(
            Callable[[str], Any],
            getattr(transformers.AutoModelForSeq2SeqLM, 'from_pretrained')(
                self.MODEL_NAME,
            ),
        )

        # BUG FIX: the original passed bare ``model`` / ``tokenizer``
        # (NameError — those locals do not exist); the attributes set
        # above are what must be forwarded. The result is the pipeline
        # object itself, so it is cast to SummarizerPipeline (the protocol
        # with ``.predict``), not to the Pipeline factory protocol.
        self.summarizer = cast(
            SummarizerPipeline,
            getattr(transformers.pipelines, 'pipeline')(
                'summarization',
                model=self.model,
                tokenizer=self.tokenizer,
            ),
        )

    def summarize(
        self,
        data: list[str],
    ) -> list[str]:
        """Summarize ``data`` and return the summary as whitespace tokens.

        The input chunks are joined with single spaces, fed through the
        pipeline, and the resulting text is split back into tokens.
        Empty input yields an empty list.
        """
        return self.summarizer.predict(
            ' '.join(data),
        ).split()
|
Loading…
Reference in New Issue
Block a user