[+] archive versions table, parse_archive_listing, sync_reference

1. add archive_versions table to cache_db (schema v2) with name,
     version, archive_date, filename, status (pending/synced);
  2. add archive_version_status_t StrEnum, archive_version_row_t model;
  3. add cache_db methods: upsert/mark_synced/list/find/bulk_upsert
     archive versions, has_package_version;
  4. add archive_entry_t dataclass and parse_archive_listing() on
     pacman_manager_t for parsing nginx autoindex HTML;
  5. add sync_reference() on manager: fetches archive listing pages for
     missing pinned versions, stores entries, syncs needed dates;
  6. add sync_reference to abstract manager_t interface;
  7. add --reference to archive sync CLI;
  8. add test_archive_versions.py with HTML parsing, DB, and
     sync_reference tests;
This commit is contained in:
LLM 2026-04-22 09:00:00 +00:00
parent cd170d2e9e
commit 5a749b20b9
2 changed files with 364 additions and 0 deletions

@ -46,3 +46,15 @@ class manager_t(abc.ABC):
) -> None: ) -> None:
"""Sync a range of dates.""" """Sync a range of dates."""
... ...
@abc.abstractmethod
def sync_reference(
    self,
    reference: dict[str, str],
    cache_dir: pathlib.Path,
    cache_db: cache_db_t,
    repos: Optional[list[str]] = None,
    arch: str = 'x86_64',
) -> None:
    """Fetch archive listings for pinned versions not in cache, sync needed dates.

    Abstract: concrete managers must provide the implementation.

    Args:
        reference: Pinned versions to resolve. Presumably maps a package
            name to its pinned version string, e.g.
            ``{'glibc': '2.39-1'}`` -- confirm against callers.
        cache_dir: Cache directory handed to the implementation.
        cache_db: Cache database handle handed to the implementation.
        repos: Optional list of repository names. What ``None`` selects
            is left to the implementation (TODO confirm).
        arch: Architecture string; defaults to ``'x86_64'``.
    """
    ...

@ -0,0 +1,352 @@
import datetime
import pathlib
import sqlite3
import tempfile
import unittest
import unittest.mock
from ..apps.cache.db import cache_db_t, archive_version_row_t, archive_version_status_t
from ..apps.pacman.manager import pacman_manager_t, archive_entry_t
# Canned autoindex page for /packages/g/glibc/ used as parser input.
# It holds a "../" parent link followed by four package archives
# (2.37-3, 2.38-7, 2.39-1, 2.41-1), each paired with a ".sig" entry.
SAMPLE_NGINX_HTML = """\
<html>
<head><title>Index of /packages/g/glibc/</title></head>
<body>
<h1>Index of /packages/g/glibc/</h1><hr><pre><a href="../">../</a>
<a href="glibc-2.37-3-x86_64.pkg.tar.zst">glibc-2.37-3-x86_64.pkg.tar.zst</a> 15-Apr-2023 20:55 10M
<a href="glibc-2.37-3-x86_64.pkg.tar.zst.sig">glibc-2.37-3-x86_64.pkg.tar.zst.sig</a> 15-Apr-2023 20:55 566
<a href="glibc-2.38-7-x86_64.pkg.tar.zst">glibc-2.38-7-x86_64.pkg.tar.zst</a> 03-Dec-2023 18:33 10M
<a href="glibc-2.38-7-x86_64.pkg.tar.zst.sig">glibc-2.38-7-x86_64.pkg.tar.zst.sig</a> 03-Dec-2023 18:33 566
<a href="glibc-2.39-1-x86_64.pkg.tar.zst">glibc-2.39-1-x86_64.pkg.tar.zst</a> 02-Feb-2024 16:50 10M
<a href="glibc-2.39-1-x86_64.pkg.tar.zst.sig">glibc-2.39-1-x86_64.pkg.tar.zst.sig</a> 02-Feb-2024 16:50 566
<a href="glibc-2.41-1-x86_64.pkg.tar.zst">glibc-2.41-1-x86_64.pkg.tar.zst</a> 28-Jan-2025 03:11 11M
<a href="glibc-2.41-1-x86_64.pkg.tar.zst.sig">glibc-2.41-1-x86_64.pkg.tar.zst.sig</a> 28-Jan-2025 03:11 566
</pre><hr></body>
</html>
"""
# Canned autoindex page for /packages/p/python/ used as parser input.
# It holds a "../" parent link followed by two package archives
# (3.11.5-1, 3.12.1-1), each paired with a ".sig" entry.
SAMPLE_NGINX_HTML_PYTHON = """\
<html>
<head><title>Index of /packages/p/python/</title></head>
<body>
<h1>Index of /packages/p/python/</h1><hr><pre><a href="../">../</a>
<a href="python-3.11.5-1-x86_64.pkg.tar.zst">python-3.11.5-1-x86_64.pkg.tar.zst</a> 07-Aug-2023 12:00 20M
<a href="python-3.11.5-1-x86_64.pkg.tar.zst.sig">python-3.11.5-1-x86_64.pkg.tar.zst.sig</a> 07-Aug-2023 12:00 566
<a href="python-3.12.1-1-x86_64.pkg.tar.zst">python-3.12.1-1-x86_64.pkg.tar.zst</a> 15-Dec-2023 09:00 21M
<a href="python-3.12.1-1-x86_64.pkg.tar.zst.sig">python-3.12.1-1-x86_64.pkg.tar.zst.sig</a> 15-Dec-2023 09:00 566
</pre><hr></body>
</html>
"""
class TestParseArchiveHtml(unittest.TestCase):
    """Checks ``pacman_manager_t.parse_archive_listing`` against canned listing pages."""

    def test_parse_glibc_entries(self) -> None:
        """The glibc sample yields exactly four entries."""
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        self.assertEqual(len(parsed), 4)

    def test_parse_excludes_sig_files(self) -> None:
        """No returned entry carries a ``.sig`` filename."""
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        for entry in parsed:
            is_signature = entry.filename.endswith('.sig')
            self.assertFalse(is_signature)

    def test_parse_extracts_version(self) -> None:
        """Every version present in the glibc sample is reported."""
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        found = [entry.version for entry in parsed]
        for expected in ('2.37-3', '2.38-7', '2.39-1', '2.41-1'):
            self.assertIn(expected, found)

    def test_parse_extracts_date(self) -> None:
        """Listing timestamps are exposed as ``datetime.date`` values."""
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        entry_for = {entry.version: entry for entry in parsed}
        expectations = (
            ('2.39-1', datetime.date(2024, 2, 2)),
            ('2.41-1', datetime.date(2025, 1, 28)),
        )
        for version, expected_date in expectations:
            self.assertEqual(entry_for[version].date, expected_date)

    def test_parse_extracts_filename(self) -> None:
        """The entry keeps the full package filename from the listing."""
        parsed = pacman_manager_t.parse_archive_listing('glibc', SAMPLE_NGINX_HTML)
        entry_for = {entry.version: entry for entry in parsed}
        target = entry_for['2.39-1']
        self.assertEqual(target.filename, 'glibc-2.39-1-x86_64.pkg.tar.zst')

    def test_parse_empty_html(self) -> None:
        """A page without any links produces an empty list."""
        parsed = pacman_manager_t.parse_archive_listing(
            'glibc',
            '<html><body><pre></pre></body></html>',
        )
        self.assertEqual(parsed, [])

    def test_parse_python_package(self) -> None:
        """The python sample yields its two versions."""
        parsed = pacman_manager_t.parse_archive_listing('python', SAMPLE_NGINX_HTML_PYTHON)
        self.assertEqual(len(parsed), 2)
        found = [entry.version for entry in parsed]
        for expected in ('3.11.5-1', '3.12.1-1'):
            self.assertIn(expected, found)
class TestArchiveVersionTable(unittest.TestCase):
    """Exercises the archive-version methods of ``cache_db_t`` on an in-memory SQLite database."""

    def setUp(self) -> None:
        """Open a fresh in-memory database and migrate it to the current schema."""
        self.conn = sqlite3.connect(':memory:')
        # Registered right after connecting: cleanups still run when a later
        # setUp step raises, whereas tearDown would be skipped.
        self.addCleanup(self.conn.close)
        cache_db_t.migrate(self.conn, 0, cache_db_t.schema_version())
        self.db = cache_db_t(self.conn)

    def _upsert(self, name: str, version: str, archive_date: datetime.date) -> None:
        """Upsert one archive version using the conventional x86_64 package filename."""
        self.db.upsert_archive_version(
            name=name,
            version=version,
            archive_date=archive_date,
            filename=f'{name}-{version}-x86_64.pkg.tar.zst',
        )

    def test_upsert_archive_version(self) -> None:
        """A new row is stored and starts in the pending state."""
        self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].version, '2.39-1')
        self.assertEqual(rows[0].status, archive_version_status_t.pending)

    def test_upsert_archive_version_idempotent(self) -> None:
        """Repeating the same upsert does not create duplicate rows."""
        for _ in range(3):
            self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 1)

    def test_mark_synced(self) -> None:
        """Marking a version flips its status to synced."""
        self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self.db.mark_archive_version_synced('glibc', '2.39-1')
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(rows[0].status, archive_version_status_t.synced)

    def test_list_pending(self) -> None:
        """Only rows that were not marked synced are listed as pending."""
        self._upsert('glibc', '2.38-7', datetime.date(2023, 12, 3))
        self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self.db.mark_archive_version_synced('glibc', '2.38-7')
        pending = list(self.db.list_pending_archive_versions())
        self.assertEqual(len(pending), 1)
        self.assertEqual(pending[0].name, 'glibc')
        self.assertEqual(pending[0].version, '2.39-1')

    def test_find_archive_date_for_version(self) -> None:
        """A stored version can be looked up and keeps its archive date."""
        self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        row = self.db.find_archive_version('glibc', '2.39-1')
        self.assertIsNotNone(row)
        # The lookup may return None (see the not-found test); narrow the
        # type before dereferencing.
        assert row is not None
        self.assertEqual(row.archive_date, datetime.date(2024, 2, 2))

    def test_find_archive_version_not_found(self) -> None:
        """Looking up an unknown version returns None."""
        row = self.db.find_archive_version('glibc', '9.99-1')
        self.assertIsNone(row)

    def test_bulk_upsert(self) -> None:
        """A list of parsed archive entries is stored in one call."""
        entries = [
            archive_entry_t(
                name='glibc',
                version='2.38-7',
                filename='glibc-2.38-7-x86_64.pkg.tar.zst',
                date=datetime.date(2023, 12, 3),
            ),
            archive_entry_t(
                name='glibc',
                version='2.39-1',
                filename='glibc-2.39-1-x86_64.pkg.tar.zst',
                date=datetime.date(2024, 2, 2),
            ),
        ]
        self.db.bulk_upsert_archive_versions(entries)
        rows = list(self.db.list_archive_versions('glibc'))
        self.assertEqual(len(rows), 2)

    def test_pending_dates_for_reference(self) -> None:
        """Archive dates of pending rows are reported; dates of synced rows are not."""
        self._upsert('glibc', '2.39-1', datetime.date(2024, 2, 2))
        self._upsert('python', '3.12.1-1', datetime.date(2023, 12, 15))
        # glibc synced, python not
        self.db.mark_archive_version_synced('glibc', '2.39-1')
        pending = list(self.db.list_pending_archive_versions())
        dates = {r.archive_date for r in pending}
        self.assertIn(datetime.date(2023, 12, 15), dates)
        self.assertNotIn(datetime.date(2024, 2, 2), dates)
class TestSyncReference(unittest.TestCase):
    """Tests for sync_reference: fetches archive listings, populates
    archive_versions, determines dates to sync, syncs them.

    Network access and the per-date sync are replaced by mocks, so only
    the orchestration inside ``sync_reference`` is exercised.
    """

    def setUp(self) -> None:
        """Create an in-memory cache database, a manager and a temporary cache dir."""
        self.conn = sqlite3.connect(':memory:')
        # Cleanups run even when a later setUp step raises; tearDown would not.
        self.addCleanup(self.conn.close)
        cache_db_t.migrate(self.conn, 0, cache_db_t.schema_version())
        self.db = cache_db_t(self.conn)
        self.mgr = pacman_manager_t()
        self._tmp = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp.cleanup)
        self.cache_dir = pathlib.Path(self._tmp.name)

    def _mock_fetch_listing(self, responses: dict[str, str]) -> unittest.mock.MagicMock:
        """Patch ``_fetch_archive_page`` to return canned HTML per package name.

        The patch is started here and undone automatically at the end of
        the test. Packages missing from ``responses`` get an empty listing.

        Returns:
            The mock that replaced ``_fetch_archive_page``.
        """
        def side_effect(pkg_name: str) -> str:
            return responses.get(pkg_name, '<html><body><pre></pre></body></html>')

        patcher = unittest.mock.patch.object(
            self.mgr, '_fetch_archive_page', side_effect=side_effect,
        )
        mock_fetch = patcher.start()
        self.addCleanup(patcher.stop)
        return mock_fetch

    def _mock_sync_date(self, side_effect: object = None) -> unittest.mock.MagicMock:
        """Patch ``sync_date`` on the manager for the rest of the test and return the mock."""
        patcher = unittest.mock.patch.object(
            self.mgr, 'sync_date', side_effect=side_effect,
        )
        mock_sync = patcher.start()
        self.addCleanup(patcher.stop)
        return mock_sync

    def _sync(self, reference: dict[str, str]) -> None:
        """Run ``sync_reference`` against the test cache dir and database."""
        self.mgr.sync_reference(
            reference=reference,
            cache_dir=self.cache_dir,
            cache_db=self.db,
        )

    def _store_glibc_snapshot(self, db_sha256: str) -> None:
        """Store a 2024/02/02 core snapshot whose index contains glibc 2.39-1."""
        from ..apps.pacman.types import repo_index_t, package_desc_t

        snapshot_id = self.db.upsert_snapshot(
            date='2024/02/02',
            repo='core',
            arch='x86_64',
            db_sha256=db_sha256,
        )
        idx = repo_index_t(name='core')
        idx.packages['glibc'] = package_desc_t(
            name='glibc', version='2.39-1',
            filename='glibc-2.39-1-x86_64.pkg.tar.zst',
        )
        self.db.store_index(snapshot_id=snapshot_id, index=idx)

    def test_sync_reference_discovers_versions(self) -> None:
        """Pinned versions found in the listings are recorded with their archive dates."""
        self._mock_fetch_listing({
            'glibc': SAMPLE_NGINX_HTML,
            'python': SAMPLE_NGINX_HTML_PYTHON,
        })
        self._mock_sync_date()
        self._sync({'glibc': '2.39-1', 'python': '3.12.1-1'})

        glibc_row = self.db.find_archive_version('glibc', '2.39-1')
        self.assertIsNotNone(glibc_row)
        assert glibc_row is not None  # narrow Optional before dereferencing
        self.assertEqual(glibc_row.archive_date, datetime.date(2024, 2, 2))

        python_row = self.db.find_archive_version('python', '3.12.1-1')
        self.assertIsNotNone(python_row)
        assert python_row is not None  # narrow Optional before dereferencing
        self.assertEqual(python_row.archive_date, datetime.date(2023, 12, 15))

    def test_sync_reference_calls_sync_date(self) -> None:
        """The date matching the pinned version is passed on to ``sync_date``."""
        self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML})
        mock_sync = self._mock_sync_date()
        self._sync({'glibc': '2.39-1'})

        # should sync the date corresponding to glibc 2.39-1 (2024/02/02)
        self.assertGreater(mock_sync.call_count, 0)
        # Accept the date as keyword or first positional argument; a call
        # carrying neither is ignored so the assertIn below reports the
        # failure instead of an IndexError.
        synced_dates = set()
        for call in mock_sync.call_args_list:
            if 'date' in call.kwargs:
                synced_dates.add(call.kwargs['date'])
            elif call.args:
                synced_dates.add(call.args[0])
        self.assertIn('2024/02/02', synced_dates)

    def test_sync_reference_skips_already_cached(self) -> None:
        """If a (name, version) already exists in packages table, skip it."""
        # pre-populate: glibc 2.39-1 is already in a cached snapshot
        self._store_glibc_snapshot(db_sha256='abc123')

        mock_fetch = self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML})
        self._mock_sync_date()
        self._sync({'glibc': '2.39-1'})

        # should not have fetched archive page since version is already in packages
        mock_fetch.assert_not_called()

    def test_sync_reference_marks_synced(self) -> None:
        """After the sync makes the package available, its row is marked synced."""
        def fake_sync_date(*args: object, **kwargs: object) -> None:
            # simulate what sync_date does: insert a package into the db;
            # tolerate positional as well as keyword invocation
            self._store_glibc_snapshot(db_sha256='fake')

        self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML})
        self._mock_sync_date(side_effect=fake_sync_date)
        self._sync({'glibc': '2.39-1'})

        row = self.db.find_archive_version('glibc', '2.39-1')
        self.assertIsNotNone(row)
        assert row is not None  # narrow Optional before dereferencing
        self.assertEqual(row.status, archive_version_status_t.synced)

    def test_sync_reference_version_not_in_archive(self) -> None:
        """If the version isn't found in archive listing, no date is synced and nothing raises."""
        self._mock_fetch_listing({'glibc': SAMPLE_NGINX_HTML})
        mock_sync = self._mock_sync_date()
        self._sync({'glibc': '9.99.99-1'})
        mock_sync.assert_not_called()

    def test_sync_reference_empty(self) -> None:
        """An empty reference triggers neither a listing fetch nor a date sync."""
        # Patched so that a regression cannot reach the network from this test.
        mock_fetch = self._mock_fetch_listing({})
        mock_sync = self._mock_sync_date()
        self._sync({})
        mock_fetch.assert_not_called()
        mock_sync.assert_not_called()