[+] archive versions table, parse_archive_listing, sync_reference
1. add archive_versions table to cache_db (schema v2) with name,
version, archive_date, filename, status (pending/synced);
2. add archive_version_status_t StrEnum, archive_version_row_t model;
3. add cache_db methods: upsert/mark_synced/list/find/bulk_upsert
archive versions, has_package_version;
4. add archive_entry_t dataclass and parse_archive_listing() on
pacman_manager_t for parsing nginx autoindex HTML;
5. add sync_reference() on manager: fetches archive listing pages for
missing pinned versions, stores entries, syncs needed dates;
6. add sync_reference to abstract manager_t interface;
7. add --reference to archive sync CLI;
8. add test_archive_versions.py with HTML parsing, DB, and
sync_reference tests;
This commit is contained in:
parent
cd170d2e9e
commit
5a749b20b9
@ -46,3 +46,15 @@ class manager_t(abc.ABC):
|
||||
) -> None:
|
||||
"""Sync a range of dates."""
|
||||
...
|
||||
|
||||
@abc.abstractmethod
def sync_reference(
    self,
    reference: dict[str, str],
    cache_dir: pathlib.Path,
    cache_db: cache_db_t,
    repos: Optional[list[str]] = None,
    arch: str = 'x86_64',
) -> None:
    """Make the pinned versions in ``reference`` available in the cache.

    Implementations fetch archive listings only for pinned versions that
    are not cached yet, then sync the dates those versions require.

    NOTE(review): ``reference`` presumably maps package name -> pinned
    version, and ``repos=None`` presumably means "use the implementation's
    default repositories" -- confirm against the concrete manager.
    """
    ...
|
||||
|
||||
@ -0,0 +1,352 @@
|
||||
import contextlib
import datetime
import pathlib
import sqlite3
import tempfile
import unittest
import unittest.mock

from ..apps.cache.db import cache_db_t, archive_version_row_t, archive_version_status_t
from ..apps.pacman.manager import pacman_manager_t, archive_entry_t
|
||||
|
||||
|
||||
# Canned autoindex page for /packages/g/glibc/: four package archives, each
# followed by its ``.sig`` companion entry. Kept free of any non-HTML filler
# lines so the fixture contains exactly the listing markup.
SAMPLE_NGINX_HTML = """\
<html>
<head><title>Index of /packages/g/glibc/</title></head>
<body>
<h1>Index of /packages/g/glibc/</h1><hr><pre><a href="../">../</a>
<a href="glibc-2.37-3-x86_64.pkg.tar.zst">glibc-2.37-3-x86_64.pkg.tar.zst</a> 15-Apr-2023 20:55 10M
<a href="glibc-2.37-3-x86_64.pkg.tar.zst.sig">glibc-2.37-3-x86_64.pkg.tar.zst.sig</a> 15-Apr-2023 20:55 566
<a href="glibc-2.38-7-x86_64.pkg.tar.zst">glibc-2.38-7-x86_64.pkg.tar.zst</a> 03-Dec-2023 18:33 10M
<a href="glibc-2.38-7-x86_64.pkg.tar.zst.sig">glibc-2.38-7-x86_64.pkg.tar.zst.sig</a> 03-Dec-2023 18:33 566
<a href="glibc-2.39-1-x86_64.pkg.tar.zst">glibc-2.39-1-x86_64.pkg.tar.zst</a> 02-Feb-2024 16:50 10M
<a href="glibc-2.39-1-x86_64.pkg.tar.zst.sig">glibc-2.39-1-x86_64.pkg.tar.zst.sig</a> 02-Feb-2024 16:50 566
<a href="glibc-2.41-1-x86_64.pkg.tar.zst">glibc-2.41-1-x86_64.pkg.tar.zst</a> 28-Jan-2025 03:11 11M
<a href="glibc-2.41-1-x86_64.pkg.tar.zst.sig">glibc-2.41-1-x86_64.pkg.tar.zst.sig</a> 28-Jan-2025 03:11 566
</pre><hr></body>
</html>
"""
|
||||
|
||||
# Canned autoindex page for /packages/p/python/: two package archives, each
# followed by its ``.sig`` companion entry. Kept free of any non-HTML filler
# lines so the fixture contains exactly the listing markup.
SAMPLE_NGINX_HTML_PYTHON = """\
<html>
<head><title>Index of /packages/p/python/</title></head>
<body>
<h1>Index of /packages/p/python/</h1><hr><pre><a href="../">../</a>
<a href="python-3.11.5-1-x86_64.pkg.tar.zst">python-3.11.5-1-x86_64.pkg.tar.zst</a> 07-Aug-2023 12:00 20M
<a href="python-3.11.5-1-x86_64.pkg.tar.zst.sig">python-3.11.5-1-x86_64.pkg.tar.zst.sig</a> 07-Aug-2023 12:00 566
<a href="python-3.12.1-1-x86_64.pkg.tar.zst">python-3.12.1-1-x86_64.pkg.tar.zst</a> 15-Dec-2023 09:00 21M
<a href="python-3.12.1-1-x86_64.pkg.tar.zst.sig">python-3.12.1-1-x86_64.pkg.tar.zst.sig</a> 15-Dec-2023 09:00 566
</pre><hr></body>
</html>
"""
|
||||
|
||||
|
||||
class TestParseArchiveHtml(unittest.TestCase):
    # Checks pacman_manager_t.parse_archive_listing against the canned
    # autoindex pages defined at module level.

    def test_parse_glibc_entries(self) -> None:
        # The glibc page lists four package archives.
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        self.assertEqual(len(parsed), 4)

    def test_parse_excludes_sig_files(self) -> None:
        # No returned entry may point at a ``.sig`` file.
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        filenames = [entry.filename for entry in parsed]
        for filename in filenames:
            self.assertFalse(filename.endswith('.sig'))

    def test_parse_extracts_version(self) -> None:
        # Every version present in the listing must be reported.
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        found = [entry.version for entry in parsed]
        for expected in ('2.37-3', '2.38-7', '2.39-1', '2.41-1'):
            self.assertIn(expected, found)

    def test_parse_extracts_date(self) -> None:
        # The listing's day-month-year column becomes a ``datetime.date``.
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        date_of = {entry.version: entry.date for entry in parsed}
        self.assertEqual(date_of['2.39-1'], datetime.date(2024, 2, 2))
        self.assertEqual(date_of['2.41-1'], datetime.date(2025, 1, 28))

    def test_parse_extracts_filename(self) -> None:
        # The filename is the full archive name taken from the listing.
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        filename_of = {entry.version: entry.filename for entry in parsed}
        self.assertEqual(
            filename_of['2.39-1'],
            'glibc-2.39-1-x86_64.pkg.tar.zst',
        )

    def test_parse_empty_html(self) -> None:
        # A page without any links yields an empty list.
        blank_page = '<html><body><pre></pre></body></html>'
        self.assertEqual(
            pacman_manager_t.parse_archive_listing('glibc', blank_page),
            [],
        )

    def test_parse_python_package(self) -> None:
        # Same checks against a second package with a three-part version.
        parsed = pacman_manager_t.parse_archive_listing('python', SAMPLE_NGINX_HTML_PYTHON)
        self.assertEqual(len(parsed), 2)
        found = [entry.version for entry in parsed]
        for expected in ('3.11.5-1', '3.12.1-1'):
            self.assertIn(expected, found)
|
||||
|
||||
|
||||
class TestArchiveVersionTable(unittest.TestCase):
    # Exercises the archive_versions accessors of cache_db_t against a
    # freshly migrated in-memory SQLite database.

    def setUp(self) -> None:
        self.conn = sqlite3.connect(':memory:')
        # Registered before migrate(): unittest skips tearDown() when setUp()
        # raises but still runs cleanups, so the connection is always closed.
        self.addCleanup(self.conn.close)
        cache_db_t.migrate(self.conn, 0, cache_db_t.schema_version())
        self.db = cache_db_t(self.conn)

    def _add_version(self, name: str, version: str, archive_date: datetime.date) -> None:
        # Upsert one row; the filename is derived as
        # ``<name>-<version>-x86_64.pkg.tar.zst``.
        self.db.upsert_archive_version(
            name=name,
            version=version,
            archive_date=archive_date,
            filename=f'{name}-{version}-x86_64.pkg.tar.zst',
        )

    def test_upsert_archive_version(self) -> None:
        # A freshly upserted row is listed and starts out pending.
        self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].version, '2.39-1')
        self.assertEqual(rows[0].status, archive_version_status_t.pending)

    def test_upsert_archive_version_idempotent(self) -> None:
        # Upserting the same (name, version) repeatedly keeps a single row.
        for _ in range(3):
            self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 1)

    def test_mark_synced(self) -> None:
        # mark_archive_version_synced flips the row's status to synced.
        self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self.db.mark_archive_version_synced('glibc', '2.39-1')
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(rows[0].status, archive_version_status_t.synced)

    def test_list_pending(self) -> None:
        # Only the row that was not marked synced is reported as pending.
        self._add_version('glibc', '2.38-7', datetime.date(2023, 12, 3))
        self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self.db.mark_archive_version_synced('glibc', '2.38-7')

        pending = list(self.db.list_pending_archive_versions())
        self.assertEqual(len(pending), 1)
        self.assertEqual(pending[0].name, 'glibc')
        self.assertEqual(pending[0].version, '2.39-1')

    def test_find_archive_date_for_version(self) -> None:
        # Looking up a stored (name, version) returns its archive date.
        self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        row = self.db.find_archive_version('glibc', '2.39-1')
        if row is None:
            # Explicit guard (instead of assertIsNotNone) so type checkers
            # can narrow the Optional before the attribute access below.
            self.fail('find_archive_version returned None for a stored version')
        self.assertEqual(row.archive_date, datetime.date(2024, 2, 2))

    def test_find_archive_version_not_found(self) -> None:
        # An unknown version yields None rather than raising.
        row = self.db.find_archive_version('glibc', '9.99-1')
        self.assertIsNone(row)

    def test_bulk_upsert(self) -> None:
        # bulk_upsert_archive_versions stores every parsed entry it is given.
        entries = [
            archive_entry_t(
                name='glibc',
                version='2.38-7',
                filename='glibc-2.38-7-x86_64.pkg.tar.zst',
                date=datetime.date(2023, 12, 3),
            ),
            archive_entry_t(
                name='glibc',
                version='2.39-1',
                filename='glibc-2.39-1-x86_64.pkg.tar.zst',
                date=datetime.date(2024, 2, 2),
            ),
        ]
        self.db.bulk_upsert_archive_versions(entries)
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 2)

    def test_pending_dates_for_reference(self) -> None:
        """Only rows still pending contribute their archive dates.

        glibc is marked synced and python is left pending, so the python
        date must be among the pending dates and the glibc date must not.
        """
        self._add_version('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self._add_version('python', '3.12.1-1', datetime.date(2023, 12, 15))
        # glibc synced, python not
        self.db.mark_archive_version_synced('glibc', '2.39-1')

        pending = list(self.db.list_pending_archive_versions())
        dates = {r.archive_date for r in pending}
        self.assertIn(datetime.date(2023, 12, 15), dates)
        self.assertNotIn(datetime.date(2024, 2, 2), dates)
|
||||
|
||||
|
||||
class TestSyncReference(unittest.TestCase):
    """Tests for sync_reference: fetches archive listings, populates
    archive_versions, determines dates to sync, syncs them."""

    def setUp(self) -> None:
        self.conn = sqlite3.connect(':memory:')
        # Cleanups (unlike tearDown) also run when a later setUp step raises,
        # so neither the connection nor the temp directory can leak.
        self.addCleanup(self.conn.close)
        cache_db_t.migrate(self.conn, 0, cache_db_t.schema_version())
        self.db = cache_db_t(self.conn)
        self.mgr = pacman_manager_t()
        self._tmp = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp.cleanup)
        self.cache_dir = pathlib.Path(self._tmp.name)

    def _mock_fetch_listing(
        self,
        responses: dict[str, str],
    ) -> contextlib.AbstractContextManager[unittest.mock.MagicMock]:
        """Patch ``_fetch_archive_page`` to return canned HTML per package name.

        Returns the (not yet started) patcher; entering it as a context
        manager yields the mock. Packages missing from ``responses`` get an
        empty listing page.
        """
        def side_effect(pkg_name: str) -> str:
            return responses.get(pkg_name, '<html><body><pre></pre></body></html>')

        return unittest.mock.patch.object(
            self.mgr, '_fetch_archive_page', side_effect=side_effect,
        )

    def _sync(self, reference: dict[str, str]) -> None:
        # Run sync_reference against this test's temp cache dir and database.
        self.mgr.sync_reference(
            reference=reference,
            cache_dir=self.cache_dir,
            cache_db=self.db,
        )

    def _store_glibc_snapshot(self, db_sha256: str) -> None:
        # Record glibc 2.39-1 in a 'core' snapshot dated 2024/02/02, i.e. put
        # that (name, version) into the packages side of the cache.
        from ..apps.pacman.types import repo_index_t, package_desc_t

        snapshot_id = self.db.upsert_snapshot(
            date='2024/02/02',
            repo='core',
            arch='x86_64',
            db_sha256=db_sha256,
        )
        idx = repo_index_t(name='core')
        idx.packages['glibc'] = package_desc_t(
            name='glibc', version='2.39-1',
            filename='glibc-2.39-1-x86_64.pkg.tar.zst',
        )
        self.db.store_index(snapshot_id=snapshot_id, index=idx)

    def test_sync_reference_discovers_versions(self) -> None:
        # Pinned versions found in the listings end up in archive_versions
        # with the dates taken from the listing pages.
        reference = {'glibc': '2.39-1', 'python': '3.12.1-1'}

        with self._mock_fetch_listing({
            'glibc': SAMPLE_NGINX_HTML,
            'python': SAMPLE_NGINX_HTML_PYTHON,
        }):
            with unittest.mock.patch.object(self.mgr, 'sync_date'):
                self._sync(reference)

        glibc_row = self.db.find_archive_version('glibc', '2.39-1')
        if glibc_row is None:
            # Explicit guard so type checkers can narrow the Optional.
            self.fail('glibc 2.39-1 was not recorded in archive_versions')
        self.assertEqual(glibc_row.archive_date, datetime.date(2024, 2, 2))

        python_row = self.db.find_archive_version('python', '3.12.1-1')
        if python_row is None:
            self.fail('python 3.12.1-1 was not recorded in archive_versions')
        self.assertEqual(python_row.archive_date, datetime.date(2023, 12, 15))

    def test_sync_reference_calls_sync_date(self) -> None:
        reference = {'glibc': '2.39-1'}

        with self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML}):
            with unittest.mock.patch.object(self.mgr, 'sync_date') as mock_sync:
                self._sync(reference)

        # should sync the date corresponding to glibc 2.39-1 (2024/02/02)
        self.assertGreater(mock_sync.call_count, 0)
        # Accept the date whether it was passed by keyword or positionally.
        synced_dates = {
            call.kwargs.get('date') or call.args[0]
            for call in mock_sync.call_args_list
        }
        self.assertIn('2024/02/02', synced_dates)

    def test_sync_reference_skips_already_cached(self) -> None:
        """If a (name, version) already exists in packages table, skip it."""
        # pre-populate: glibc 2.39-1 is already in a cached snapshot
        self._store_glibc_snapshot(db_sha256='abc123')

        reference = {'glibc': '2.39-1'}

        with self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML}) as mock_fetch:
            with unittest.mock.patch.object(self.mgr, 'sync_date'):
                self._sync(reference)

        # should not have fetched archive page since version is already in packages
        mock_fetch.assert_not_called()

    def test_sync_reference_marks_synced(self) -> None:
        # Once sync_date has stored the pinned package, the matching
        # archive_versions row must be flagged as synced.
        reference = {'glibc': '2.39-1'}

        def fake_sync_date(*args: object, **kwargs: object) -> None:
            # simulate what sync_date does: insert a package into the db.
            # Accepts positional and keyword arguments alike so the fake
            # does not depend on how sync_reference invokes sync_date.
            self._store_glibc_snapshot(db_sha256='fake')

        with self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML}):
            with unittest.mock.patch.object(self.mgr, 'sync_date', side_effect=fake_sync_date):
                self._sync(reference)

        row = self.db.find_archive_version('glibc', '2.39-1')
        if row is None:
            self.fail('glibc 2.39-1 was not recorded in archive_versions')
        self.assertEqual(row.status, archive_version_status_t.synced)

    def test_sync_reference_version_not_in_archive(self) -> None:
        """If the version isn't found in archive listing, log warning, no crash."""
        reference = {'glibc': '9.99.99-1'}

        with self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML}):
            with unittest.mock.patch.object(self.mgr, 'sync_date') as mock_sync:
                self._sync(reference)

        mock_sync.assert_not_called()

    def test_sync_reference_empty(self) -> None:
        # An empty reference must not trigger any date sync.
        with unittest.mock.patch.object(self.mgr, 'sync_date') as mock_sync:
            self._sync({})
        mock_sync.assert_not_called()
|
||||
Loading…
Reference in New Issue
Block a user