[+] apps/network module: settings singleton, net_t helpers, CLI integration

  1. add apps/network/settings.py: net_settings_t pydantic-settings singleton
     with timeout (8s default) and max_size (50MB default), env vars
     ARCHLINUX_NET_TIMEOUT/ARCHLINUX_NET_MAX_SIZE, reset() classmethod;
  2. add apps/network/base.py: net_t class with fetch_url, fetch_text,
     head_content_length, post_json, download_to_file (sync) and async
     wrappers, all enforcing timeout and max_size from settings;
  3. add apps/network/cli.py: net_cli_t with add_arguments/extract/apply
     for --net-timeout and --net-max-size CLI args;
  4. refactor cve/base.py: remove duplicate _fetch_url/_post_json/_head
     wrappers, callers use net_t directly;
  5. refactor cve/nvd.py, cve/osv.py, cve/arch_tracker.py: use net_t;
  6. refactor pacman/client.py, pacman/manager.py: use net_t;
  7. add test_network.py with settings, read_limited, fetch, cli tests;
This commit is contained in:
LLM 2026-04-22 09:00:00 +00:00
parent 857e9d41a2
commit 8079aae41c
11 changed files with 787 additions and 152 deletions

@ -10,6 +10,8 @@ from typing import Optional
import pydantic import pydantic
from typing import TYPE_CHECKING
from .arch_tracker_types import arch_avg_t from .arch_tracker_types import arch_avg_t
from .base import SyncProgressCallback, cve_backend_t from .base import SyncProgressCallback, cve_backend_t
from .types import ( from .types import (
@ -18,8 +20,12 @@ from .types import (
cve_source_t, cve_source_t,
cve_status_t, cve_status_t,
cve_sync_estimate_t, cve_sync_estimate_t,
cve_upsert_result_t,
) )
if TYPE_CHECKING:
from .db import cve_db_t
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
ISSUES_URL = 'https://security.archlinux.org/issues/all.json' ISSUES_URL = 'https://security.archlinux.org/issues/all.json'
@ -50,7 +56,9 @@ class arch_tracker_backend_t(cve_backend_t):
since: Optional[str] = None, since: Optional[str] = None,
months: Optional[int] = None, months: Optional[int] = None,
) -> cve_sync_estimate_t: ) -> cve_sync_estimate_t:
content_length = await self._head_content_length(ISSUES_URL) from ..network.base import net_t
content_length = await net_t.async_head_content_length(ISSUES_URL)
return cve_sync_estimate_t( return cve_sync_estimate_t(
source=cve_source_t.arch_tracker, source=cve_source_t.arch_tracker,
num_fetches=1, num_fetches=1,
@ -61,13 +69,16 @@ class arch_tracker_backend_t(cve_backend_t):
async def sync( async def sync(
self, self,
db: 'cve_db_t',
since: Optional[str] = None, since: Optional[str] = None,
months: Optional[int] = None, months: Optional[int] = None,
on_progress: Optional[SyncProgressCallback] = None, on_progress: Optional[SyncProgressCallback] = None,
) -> list[cve_entry_t]: ) -> cve_upsert_result_t:
logger.info(dict(msg='fetch', source='arch_tracker', url=ISSUES_URL)) logger.info(dict(msg='fetch', source='arch_tracker', url=ISSUES_URL))
raw_bytes = await self._fetch_url(ISSUES_URL) from ..network.base import net_t
raw_bytes = await net_t.async_fetch_url(ISSUES_URL)
avgs = _avg_list_adapter.validate_json(raw_bytes) avgs = _avg_list_adapter.validate_json(raw_bytes)
logger.info(dict(msg='fetched', source='arch_tracker', avgs=len(avgs), bytes=len(raw_bytes))) logger.info(dict(msg='fetched', source='arch_tracker', avgs=len(avgs), bytes=len(raw_bytes)))
@ -94,4 +105,4 @@ class arch_tracker_backend_t(cve_backend_t):
) )
logger.info(dict(msg='parsed', source='arch_tracker', avgs=len(avgs), entries=len(entries))) logger.info(dict(msg='parsed', source='arch_tracker', avgs=len(avgs), entries=len(entries)))
return entries return await self._store_and_update_meta(db, entries)

@ -1,18 +1,21 @@
"""Abstract backend interface for CVE data sources.""" """Abstract backend interface for CVE data sources."""
import abc import abc
import asyncio import datetime
import logging import logging
import urllib.request
from typing import Callable, Optional from typing import TYPE_CHECKING, Callable, Optional
from .types import ( from .types import (
cve_entry_t, cve_entry_t,
cve_source_t, cve_source_t,
cve_sync_estimate_t, cve_sync_estimate_t,
cve_upsert_result_t,
) )
if TYPE_CHECKING:
from .db import cve_db_t
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SyncProgressCallback = Callable[[int, int], None] SyncProgressCallback = Callable[[int, int], None]
@ -35,49 +38,28 @@ class cve_backend_t(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
async def sync( async def sync(
self, self,
db: 'cve_db_t',
since: Optional[str] = None, since: Optional[str] = None,
months: Optional[int] = None, months: Optional[int] = None,
on_progress: Optional[SyncProgressCallback] = None, on_progress: Optional[SyncProgressCallback] = None,
) -> list[cve_entry_t]: ) -> cve_upsert_result_t:
"""Fetch entries and store them in db. Returns upsert result."""
raise NotImplementedError raise NotImplementedError
@staticmethod async def _store_and_update_meta(
async def _head_content_length(url: str) -> int: self,
loop = asyncio.get_running_loop() db: 'cve_db_t',
try: entries: list[cve_entry_t],
def _do() -> int: ) -> cve_upsert_result_t:
req = urllib.request.Request(url, method='HEAD') """Common: upsert entries + update sync meta. Called by subclasses."""
resp = urllib.request.urlopen(req, timeout=10) result = db.upsert_entries(entries)
cl = resp.headers.get('Content-Length', '0') now = datetime.datetime.now(datetime.timezone.utc).isoformat()
return int(cl) db.update_sync_meta(self.source, last_sync=now, entry_count=db.count_entries(self.source))
logger.info(dict(
msg='ingested',
source=self.source.value,
received=result.received,
in_db=result.inserted,
))
return result
return await loop.run_in_executor(None, _do)
except Exception:
logger.debug(dict(msg='HEAD failed', url=url))
return 0
@staticmethod
async def _fetch_url(url: str, timeout: int = 30) -> bytes:
loop = asyncio.get_running_loop()
def _do() -> bytes:
resp = urllib.request.urlopen(url, timeout=timeout)
return resp.read()
return await loop.run_in_executor(None, _do)
@staticmethod
async def _post_json(url: str, data: bytes, timeout: int = 30) -> bytes:
loop = asyncio.get_running_loop()
def _do() -> bytes:
req = urllib.request.Request(
url,
data=data,
headers={'Content-Type': 'application/json'},
method='POST',
)
resp = urllib.request.urlopen(req, timeout=timeout)
return resp.read()
return await loop.run_in_executor(None, _do)

@ -3,15 +3,16 @@
Source: https://services.nvd.nist.gov/rest/json/cves/2.0 Source: https://services.nvd.nist.gov/rest/json/cves/2.0
Optional API key. Rate limited: 5 req/30s without key, 50 with key. Optional API key. Rate limited: 5 req/30s without key, 50 with key.
Paginated (max 2000/page). Supports lastModStartDate/lastModEndDate (max 120 days). Paginated (max 2000/page). Supports lastModStartDate/lastModEndDate (max 120 days).
Uses cve_sync_days to skip already-fetched day ranges.
""" """
import asyncio import asyncio
import datetime
import logging import logging
import math import math
import urllib.parse import urllib.parse
from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, Optional
from typing import Optional
import pydantic import pydantic
@ -22,8 +23,12 @@ from .types import (
cve_severity_t, cve_severity_t,
cve_source_t, cve_source_t,
cve_sync_estimate_t, cve_sync_estimate_t,
cve_upsert_result_t,
) )
if TYPE_CHECKING:
from .db import cve_db_t
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
BASE_URL = 'https://services.nvd.nist.gov/rest/json/cves/2.0' BASE_URL = 'https://services.nvd.nist.gov/rest/json/cves/2.0'
@ -45,11 +50,13 @@ def _severity_from_nvd(s: str) -> cve_severity_t:
return mapping.get(s.upper(), cve_severity_t.unknown) return mapping.get(s.upper(), cve_severity_t.unknown)
def _date_ranges(start: datetime, end: datetime) -> list[tuple[str, str]]: def _chunk_range(start: datetime.date, end: datetime.date) -> list[tuple[str, str]]:
"""Split a date range into chunks of MAX_RANGE_DAYS for NVD API."""
ranges: list[tuple[str, str]] = [] ranges: list[tuple[str, str]] = []
cur = start cur = datetime.datetime.combine(start, datetime.time.min, tzinfo=datetime.timezone.utc)
while cur < end: end_dt = datetime.datetime.combine(end, datetime.time(23, 59, 59), tzinfo=datetime.timezone.utc)
chunk_end = min(cur + timedelta(days=MAX_RANGE_DAYS), end) while cur < end_dt:
chunk_end = min(cur + datetime.timedelta(days=MAX_RANGE_DAYS), end_dt)
ranges.append(( ranges.append((
cur.strftime('%Y-%m-%dT%H:%M:%S.000'), cur.strftime('%Y-%m-%dT%H:%M:%S.000'),
chunk_end.strftime('%Y-%m-%dT%H:%M:%S.000'), chunk_end.strftime('%Y-%m-%dT%H:%M:%S.000'),
@ -71,34 +78,27 @@ class nvd_backend_t(cve_backend_t):
return '%s?%s' % (BASE_URL, urllib.parse.urlencode(params)) return '%s?%s' % (BASE_URL, urllib.parse.urlencode(params))
async def _fetch_page(self, url: str) -> nvd_response_t: async def _fetch_page(self, url: str) -> nvd_response_t:
loop = asyncio.get_running_loop() headers: dict[str, str] = {}
if self._api_key:
headers['apiKey'] = self._api_key
api_key = self._api_key from ..network.base import net_t
def _do() -> bytes: raw = await net_t.async_fetch_url(url, headers=headers)
import urllib.request as ur
req = ur.Request(url)
if api_key:
req.add_header('apiKey', api_key)
resp = ur.urlopen(req, timeout=30)
return resp.read()
raw = await loop.run_in_executor(None, _do)
return _response_adapter.validate_json(raw) return _response_adapter.validate_json(raw)
def _compute_date_range( def _compute_date_range(
self, self,
since: Optional[str], since: Optional[str],
months: Optional[int], months: Optional[int],
) -> tuple[datetime, datetime]: ) -> tuple[datetime.date, datetime.date]:
end = datetime.now(timezone.utc) end = datetime.date.today()
if since is not None: if since is not None:
start = datetime.fromisoformat(since).replace(tzinfo=timezone.utc) start = datetime.date.fromisoformat(since)
elif months is not None: elif months is not None:
start = end - timedelta(days=months * 30) start = end - datetime.timedelta(days=months * 30)
else: else:
start = end - timedelta(days=120) start = end - datetime.timedelta(days=120)
return start, end return start, end
async def estimate_sync( async def estimate_sync(
@ -107,26 +107,25 @@ class nvd_backend_t(cve_backend_t):
months: Optional[int] = None, months: Optional[int] = None,
) -> cve_sync_estimate_t: ) -> cve_sync_estimate_t:
start, end = self._compute_date_range(since, months) start, end = self._compute_date_range(since, months)
ranges = _date_ranges(start, end) chunks = _chunk_range(start, end)
if len(ranges) == 0: if len(chunks) == 0:
return cve_sync_estimate_t(source=cve_source_t.nvd, available=False) return cve_sync_estimate_t(source=cve_source_t.nvd, available=False)
params = { params = {
'lastModStartDate': ranges[0][0], 'lastModStartDate': chunks[0][0],
'lastModEndDate': ranges[0][1], 'lastModEndDate': chunks[0][1],
'resultsPerPage': '1', 'resultsPerPage': '1',
} }
try: try:
page = await self._fetch_page(self._build_url(params)) page = await self._fetch_page(self._build_url(params))
total_first_range = page.totalResults total_first = page.totalResults
except Exception as e: except Exception as e:
logger.warning(dict(msg='nvd estimate failed', error=str(e))) logger.warning(dict(msg='nvd estimate failed', error=str(e)))
return cve_sync_estimate_t(source=cve_source_t.nvd, available=False) return cve_sync_estimate_t(source=cve_source_t.nvd, available=False)
estimated_total = total_first_range * len(ranges) pages_per_chunk = max(1, math.ceil(total_first / PAGE_SIZE))
pages_per_range = max(1, math.ceil(total_first_range / PAGE_SIZE)) num_fetches = pages_per_chunk * len(chunks)
num_fetches = pages_per_range * len(ranges)
return cve_sync_estimate_t( return cve_sync_estimate_t(
source=cve_source_t.nvd, source=cve_source_t.nvd,
@ -138,76 +137,175 @@ class nvd_backend_t(cve_backend_t):
async def sync( async def sync(
self, self,
db: 'cve_db_t',
since: Optional[str] = None, since: Optional[str] = None,
months: Optional[int] = None, months: Optional[int] = None,
on_progress: Optional[SyncProgressCallback] = None, on_progress: Optional[SyncProgressCallback] = None,
) -> list[cve_entry_t]: ) -> cve_upsert_result_t:
start, end = self._compute_date_range(since, months) start, end = self._compute_date_range(since, months)
ranges = _date_ranges(start, end)
entries: list[cve_entry_t] = [] # compute missing ranges using db
missing = db.compute_missing_ranges(cve_source_t.nvd, start, end)
if len(missing) == 0:
logger.info(dict(msg='nvd sync: all days already fetched', start=str(start), end=str(end)))
return cve_upsert_result_t(received=0, inserted=db.count_entries(cve_source_t.nvd))
total_missing_days = sum((e - s).days + 1 for s, e in missing)
logger.info(dict(
msg='nvd sync plan',
target_range='%s to %s' % (start, end),
missing_ranges=len(missing),
missing_days=total_missing_days,
))
all_entries: list[cve_entry_t] = []
fetch_count = 0 fetch_count = 0
for range_start, range_end in ranges: for gap_idx, (gap_start, gap_end) in enumerate(missing):
start_index = 0 chunks = _chunk_range(gap_start, gap_end)
while True: logger.info(dict(
params = { msg='nvd gap start',
'lastModStartDate': range_start, gap='%d/%d' % (gap_idx + 1, len(missing)),
'lastModEndDate': range_end, range='%s to %s' % (gap_start, gap_end),
'resultsPerPage': str(PAGE_SIZE), days=(gap_end - gap_start).days + 1,
'startIndex': str(start_index), ))
}
url = self._build_url(params) for chunk_start_str, chunk_end_str in chunks:
logger.info(dict(msg='nvd fetch', url=url)) start_index = 0
chunk_page = 0
chunk_total: Optional[int] = None
days_seen_in_chunk: set[datetime.date] = set()
page = await self._fetch_page(url) while True:
fetch_count += 1 params = {
'lastModStartDate': chunk_start_str,
'lastModEndDate': chunk_end_str,
'resultsPerPage': str(PAGE_SIZE),
'startIndex': str(start_index),
}
for vuln in page.vulnerabilities: url = self._build_url(params)
cve = vuln.cve chunk_page += 1
desc = '' fetch_count += 1
for d in cve.descriptions:
if d.lang == 'en':
desc = d.value
break
score = 0.0 logger.info(dict(
severity = cve_severity_t.unknown msg='nvd fetch',
for metric_key in ('cvssMetricV31', 'cvssMetricV30', 'cvssMetricV2'): range='%s..%s' % (chunk_start_str[:10], chunk_end_str[:10]),
metrics = cve.metrics.get(metric_key, []) chunk_page=chunk_page,
if len(metrics) > 0: start_index=start_index,
m = metrics[0] chunk_total=chunk_total or '?',
score = m.cvssData.baseScore fetches_total=fetch_count,
severity = _severity_from_nvd(m.cvssData.baseSeverity) ))
break
entries.append( page = await self._fetch_page(url)
cve_entry_t(
cve_id=cve.id, if chunk_total is None:
source=cve_source_t.nvd, chunk_total = page.totalResults
product=cve.id, logger.info(dict(
severity=severity, msg='nvd chunk total',
score=score, range='%s..%s' % (chunk_start_str[:10], chunk_end_str[:10]),
title=cve.id, total_results=page.totalResults,
description=desc, ))
date_published=cve.published,
date_modified=cve.lastModified, if page.totalResults == 0:
# empty range — mark all days in this gap as complete
empty_days: list[datetime.date] = []
ed: datetime.date = gap_start
while ed <= gap_end:
empty_days.append(ed)
ed = ed + datetime.timedelta(days=1)
db.mark_days_complete(cve_source_t.nvd, empty_days)
logger.info(dict(
msg='nvd empty range',
range='%s to %s' % (gap_start, gap_end),
days_marked=len(empty_days),
))
break
for vuln in page.vulnerabilities:
cve = vuln.cve
desc = ''
for desc_item in cve.descriptions:
if desc_item.lang == 'en':
desc = desc_item.value
break
score = 0.0
severity = cve_severity_t.unknown
for metric_key in ('cvssMetricV31', 'cvssMetricV30', 'cvssMetricV2'):
metrics = cve.metrics.get(metric_key, [])
if len(metrics) > 0:
m = metrics[0]
score = m.cvssData.baseScore
severity = _severity_from_nvd(m.cvssData.baseSeverity)
break
all_entries.append(
cve_entry_t(
cve_id=cve.id,
source=cve_source_t.nvd,
product=cve.id,
severity=severity,
score=score,
title=cve.id,
description=desc,
date_published=cve.published,
date_modified=cve.lastModified,
)
) )
)
if on_progress is not None: # track modification dates for day completion
on_progress(len(entries), page.totalResults * len(ranges)) if cve.lastModified:
try:
mod_date = datetime.datetime.fromisoformat(
cve.lastModified.replace('Z', '+00:00')
).date()
days_seen_in_chunk.add(mod_date)
except ValueError:
pass
if start_index + page.resultsPerPage >= page.totalResults: if on_progress is not None:
break chunk_done = min(start_index + page.resultsPerPage, page.totalResults)
on_progress(chunk_done, page.totalResults)
start_index += page.resultsPerPage if start_index + page.resultsPerPage >= page.totalResults:
await asyncio.sleep(self._delay) break
if len(ranges) > 1: start_index += page.resultsPerPage
await asyncio.sleep(self._delay) await asyncio.sleep(self._delay)
logger.info(dict(msg='nvd sync done', fetches=fetch_count, entries=len(entries))) # mark days complete
return entries fully_paginated = (start_index + page.resultsPerPage >= page.totalResults)
if fully_paginated:
# entire gap range is done — mark all days including empty ones
complete_days: list[datetime.date] = []
cd: datetime.date = gap_start
while cd <= gap_end:
complete_days.append(cd)
cd = cd + datetime.timedelta(days=1)
elif len(days_seen_in_chunk) > 0:
# partial — mark up to day before last seen (last seen is uncertain)
sorted_days = sorted(days_seen_in_chunk)
complete_days = []
cd = gap_start
while cd < sorted_days[-1]:
complete_days.append(cd)
cd = cd + datetime.timedelta(days=1)
else:
complete_days = []
if len(complete_days) > 0:
db.mark_days_complete(cve_source_t.nvd, complete_days)
logger.info(dict(
msg='nvd days complete',
count=len(complete_days),
range='%s to %s' % (complete_days[0], complete_days[-1]),
))
await asyncio.sleep(self._delay)
logger.info(dict(msg='nvd sync done', fetches=fetch_count, entries=len(all_entries)))
return await self._store_and_update_meta(db, all_entries)

@ -7,13 +7,11 @@ Supports batch queries (up to 1000 per request).
Ecosystem list fetched from GCS bucket listing. Ecosystem list fetched from GCS bucket listing.
""" """
import asyncio
import datetime import datetime
import logging import logging
import urllib.request
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from typing import ClassVar, Optional from typing import TYPE_CHECKING, ClassVar, Optional
import pydantic import pydantic
@ -30,8 +28,12 @@ from .types import (
cve_entry_t, cve_entry_t,
cve_source_t, cve_source_t,
cve_sync_estimate_t, cve_sync_estimate_t,
cve_upsert_result_t,
) )
if TYPE_CHECKING:
from .db import cve_db_t
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
QUERY_URL = 'https://api.osv.dev/v1/querybatch' QUERY_URL = 'https://api.osv.dev/v1/querybatch'
@ -50,7 +52,6 @@ class osv_ecosystems_t:
if cls._cached is not None and not force: if cls._cached is not None and not force:
return cls._cached return cls._cached
loop = asyncio.get_running_loop()
ecosystems: list[osv_ecosystem_t] = [] ecosystems: list[osv_ecosystem_t] = []
marker = '' marker = ''
seen: set[str] = set() seen: set[str] = set()
@ -59,11 +60,9 @@ class osv_ecosystems_t:
url = '%s?delimiter=/&prefix=&marker=%s' % (GCS_BUCKET_URL, marker) url = '%s?delimiter=/&prefix=&marker=%s' % (GCS_BUCKET_URL, marker)
logger.debug(dict(msg='fetching osv ecosystems page', marker=marker)) logger.debug(dict(msg='fetching osv ecosystems page', marker=marker))
def _do(u: str = url) -> str: from ..network.base import net_t
resp = urllib.request.urlopen(u, timeout=30)
return resp.read().decode('utf-8')
raw = await loop.run_in_executor(None, _do) raw = await net_t.async_fetch_text(url)
root = ET.fromstring(raw) root = ET.fromstring(raw)
ns = '{http://doc.s3.amazonaws.com/2006-03-01}' ns = '{http://doc.s3.amazonaws.com/2006-03-01}'
@ -152,12 +151,13 @@ class osv_backend_t(cve_backend_t):
async def sync( async def sync(
self, self,
db: 'cve_db_t',
since: Optional[str] = None, since: Optional[str] = None,
months: Optional[int] = None, months: Optional[int] = None,
on_progress: Optional[SyncProgressCallback] = None, on_progress: Optional[SyncProgressCallback] = None,
) -> list[cve_entry_t]: ) -> cve_upsert_result_t:
logger.warning(dict(msg='osv sync requires explicit package list, use query_packages()')) logger.warning(dict(msg='osv sync requires explicit package list, use query_packages()'))
return [] return cve_upsert_result_t()
async def query_packages( async def query_packages(
self, self,
@ -184,7 +184,9 @@ class osv_backend_t(cve_backend_t):
] ]
) )
raw = await self._post_json(QUERY_URL, request.model_dump_json().encode('utf-8')) from ..network.base import net_t
raw = await net_t.async_post_json(QUERY_URL, request.model_dump_json().encode('utf-8'))
batch_resp = pydantic.TypeAdapter(osv_batch_response_t).validate_json(raw) batch_resp = pydantic.TypeAdapter(osv_batch_response_t).validate_json(raw)
for i, result in enumerate(batch_resp.results): for i, result in enumerate(batch_resp.results):

@ -0,0 +1,143 @@
"""Network helper class.
All HTTP requests (sync and async) go through this class so that
timeout and max_size limits from net_settings_t are enforced globally.
"""
import asyncio
import logging
import pathlib
import urllib.request
from http.client import HTTPResponse
from typing import Optional
from .settings import net_settings_t
logger = logging.getLogger(__name__)
class net_t:
    """Central HTTP helper.

    All sync and async HTTP traffic goes through these classmethods so the
    timeout / max_size limits from net_settings_t are enforced everywhere.
    """

    @classmethod
    def _settings(cls) -> 'net_settings_t':
        """Return the process-wide network settings singleton."""
        return net_settings_t.singleton()

    @classmethod
    def _read_limited(cls, resp: HTTPResponse, max_size: Optional[int] = None) -> bytes:
        """Read the response body up to max_size bytes.

        Raises ValueError once the body exceeds max_size (which defaults to
        the configured settings value).
        """
        if max_size is None:
            max_size = cls._settings().max_size
        chunks: list[bytes] = []
        total = 0
        while True:
            chunk = resp.read(65536)
            if not chunk:
                break
            total += len(chunk)
            if total > max_size:
                raise ValueError(
                    'response exceeded max_size %d bytes' % max_size
                )
            chunks.append(chunk)
        return b''.join(chunks)

    @classmethod
    def fetch_url(
        cls,
        url: str,
        timeout: Optional[float] = None,
        headers: Optional[dict[str, str]] = None,
    ) -> bytes:
        """Synchronous GET with timeout and size limit."""
        if timeout is None:
            timeout = cls._settings().timeout
        if headers is not None and len(headers) > 0:
            target = urllib.request.Request(url, headers=headers)
        else:
            target = url
        # context manager closes the connection even if _read_limited raises
        with urllib.request.urlopen(target, timeout=timeout) as resp:
            return cls._read_limited(resp)

    @classmethod
    def fetch_text(cls, url: str, timeout: Optional[float] = None) -> str:
        """Synchronous GET returning the body decoded as UTF-8."""
        return cls.fetch_url(url, timeout=timeout).decode('utf-8')

    @classmethod
    def head_content_length(cls, url: str, timeout: Optional[float] = None) -> int:
        """Synchronous HEAD returning Content-Length, or 0 on any failure."""
        if timeout is None:
            timeout = cls._settings().timeout
        try:
            req = urllib.request.Request(url, method='HEAD')
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                cl = resp.headers.get('Content-Length', '0')
                return int(cl)
        except Exception:
            # best-effort probe: callers treat 0 as "size unknown"
            logger.debug(dict(msg='HEAD failed', url=url))
            return 0

    @classmethod
    def post_json(cls, url: str, data: bytes, timeout: Optional[float] = None) -> bytes:
        """Synchronous POST of pre-encoded JSON bytes; returns the response body."""
        if timeout is None:
            timeout = cls._settings().timeout
        req = urllib.request.Request(
            url,
            data=data,
            headers={'Content-Type': 'application/json'},
            method='POST',
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return cls._read_limited(resp)

    @classmethod
    def download_to_file(cls, url: str, output_path: pathlib.Path, timeout: Optional[float] = None) -> None:
        """Synchronous download to a file path. No size limit (for package downloads)."""
        if timeout is None:
            timeout = cls._settings().timeout
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # both the connection and the output file are closed on any exit path
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            with open(output_path, 'wb') as f:
                while True:
                    chunk = resp.read(65536)
                    if not chunk:
                        break
                    f.write(chunk)

    # ── async wrappers: run the sync helpers in the default executor ──

    @classmethod
    async def async_fetch_url(
        cls,
        url: str,
        timeout: Optional[float] = None,
        headers: Optional[dict[str, str]] = None,
    ) -> bytes:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: cls.fetch_url(url, timeout=timeout, headers=headers))

    @classmethod
    async def async_fetch_text(cls, url: str, timeout: Optional[float] = None) -> str:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: cls.fetch_text(url, timeout=timeout))

    @classmethod
    async def async_head_content_length(cls, url: str, timeout: Optional[float] = None) -> int:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: cls.head_content_length(url, timeout=timeout))

    @classmethod
    async def async_post_json(cls, url: str, data: bytes, timeout: Optional[float] = None) -> bytes:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: cls.post_json(url, data, timeout=timeout))

@ -0,0 +1,45 @@
"""CLI integration for network settings.
Provides methods to inject argparse arguments, extract parsed values,
and apply them to the network settings singleton.
"""
import argparse
from typing import Any
from .settings import net_settings_t
class net_cli_t:
    """Argparse glue for the network settings singleton."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> None:
        """Register the --net-timeout / --net-max-size options on parser."""
        parser.add_argument(
            '--net-timeout',
            dest='net_timeout',
            type=float,
            default=None,
            help='timeout in seconds for non-download HTTP requests (default: 8.0)',
        )
        parser.add_argument(
            '--net-max-size',
            dest='net_max_size',
            type=int,
            default=None,
            help='max response body size in bytes for non-download HTTP requests (default: 50MB)',
        )

    @staticmethod
    def extract(namespace: argparse.Namespace) -> dict[str, Any]:
        """Collect explicitly-set network options as settings kwargs."""
        overrides: dict[str, Any] = {}
        timeout = getattr(namespace, 'net_timeout', None)
        if timeout is not None:
            overrides['timeout'] = timeout
        max_size = getattr(namespace, 'net_max_size', None)
        if max_size is not None:
            overrides['max_size'] = max_size
        return overrides

    @staticmethod
    def apply(kwargs: dict[str, Any]) -> 'net_settings_t':
        """Reset the settings singleton when overrides were given."""
        if not kwargs:
            return net_settings_t.singleton()
        return net_settings_t.reset(**kwargs)

@ -0,0 +1,31 @@
"""Network settings singleton based on pydantic-settings.
Values can be set via environment variables (ARCHLINUX_NET_TIMEOUT, etc.)
or by calling net_settings_t.reset() with explicit kwargs.
"""
import pydantic_settings
from typing import Any, ClassVar, Optional
class net_settings_t(pydantic_settings.BaseSettings):
    """Process-wide network configuration (timeout / max response size).

    Values come from environment variables with the ARCHLINUX_NET_ prefix
    (e.g. ARCHLINUX_NET_TIMEOUT) or from explicit kwargs passed to reset().
    """

    model_config = pydantic_settings.SettingsConfigDict(
        env_prefix='ARCHLINUX_NET_',
    )

    timeout: float = 8.0  # seconds, for non-download HTTP requests
    max_size: int = 50 * 1024 * 1024  # max response body size in bytes (50MB)

    # lazily-created shared instance
    _instance: ClassVar[Optional['net_settings_t']] = None

    @classmethod
    def singleton(cls) -> 'net_settings_t':
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def reset(cls, **kwargs: Any) -> 'net_settings_t':
        """Replace the singleton with a freshly-built instance.

        Constructing via cls(**kwargs) — rather than model_validate, which
        bypasses BaseSettings' source machinery — keeps environment
        variables in effect for any field not explicitly overridden.
        """
        cls._instance = cls(**kwargs)
        return cls._instance

@ -169,7 +169,7 @@ class pacman_t:
url: str, url: str,
output_path: pathlib.Path, output_path: pathlib.Path,
) -> None: ) -> None:
import urllib.request from ..network.base import net_t
logger.info( logger.info(
dict( dict(
@ -179,12 +179,11 @@ class pacman_t:
) )
) )
output_path.parent.mkdir(parents=True, exist_ok=True) net_t.download_to_file(url, output_path)
urllib.request.urlretrieve( @staticmethod
url, def build_install_command(paths: list[pathlib.Path]) -> list[str]:
str(output_path), return ['pacman', '-U', '--noconfirm'] + [str(p) for p in paths]
)
@staticmethod @staticmethod
def build_mirror_config(options: compile_options_t) -> mirror_config_t: def build_mirror_config(options: compile_options_t) -> mirror_config_t:

@ -1,5 +1,6 @@
"""Pacman implementation of the archive manager interface.""" """Pacman implementation of the archive manager interface."""
import dataclasses
import datetime import datetime
import logging import logging
import pathlib import pathlib
@ -19,23 +20,42 @@ from .types import mirror_config_t
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclasses.dataclass
class archive_entry_t:
    """One package file parsed from an archive.archlinux.org listing page."""
    # package name, e.g. 'glibc'
    name: str
    # version string extracted from the package filename
    version: str
    # full package file name as it appears in the listing
    filename: str
    # listing date parsed from the index page (DD-Mon-YYYY column)
    date: datetime.date
class pacman_manager_t(manager_t): class pacman_manager_t(manager_t):
class constants_t: class constants_t:
base_url: ClassVar[str] = 'https://archive.archlinux.org/repos/' base_url: ClassVar[str] = 'https://archive.archlinux.org/repos/'
packages_url: ClassVar[str] = 'https://archive.archlinux.org/packages/'
href_re: ClassVar[re.Pattern[str]] = re.compile( href_re: ClassVar[re.Pattern[str]] = re.compile(
r'href="(\d{4}/\d{2}/\d{2})/"' r'href="(\d{4}/\d{2}/\d{2})/"'
) )
# matches: <a href="filename">filename</a> DD-Mon-YYYY HH:MM size
listing_re: ClassVar[re.Pattern[str]] = re.compile(
r'<a href="([^"]+)">(?:[^<]+)</a>\s+'
r'(\d{2})-([A-Za-z]{3})-(\d{4})\s+(\d{2}:\d{2})\s+'
r'(\S+)'
)
month_map: ClassVar[dict[str, int]] = {
'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}
default_repos: ClassVar[list[str]] = ['core', 'extra', 'multilib'] default_repos: ClassVar[list[str]] = ['core', 'extra', 'multilib']
def list_remote_dates(self) -> list[str]: def list_remote_dates(self) -> list[str]:
import urllib.request from ..network.base import net_t
base_url = pacman_manager_t.constants_t.base_url base_url = pacman_manager_t.constants_t.base_url
logger.info(dict(msg='fetching archive index', url=base_url)) logger.info(dict(msg='fetching archive index', url=base_url))
with urllib.request.urlopen(base_url) as resp: html = net_t.fetch_text(base_url)
html = resp.read().decode('utf-8')
dates: list[str] = [] dates: list[str] = []
for m in pacman_manager_t.constants_t.href_re.finditer(html): for m in pacman_manager_t.constants_t.href_re.finditer(html):
@ -162,3 +182,135 @@ class pacman_manager_t(manager_t):
) )
current -= step current -= step
def _fetch_archive_page(self, pkg_name: str) -> str:
    """Fetch the archive.archlinux.org listing page HTML for one package."""
    from ..network.base import net_t
    base = pacman_manager_t.constants_t.packages_url
    # listings are sharded by the first letter of the package name
    url = '%s%s/%s/' % (base, pkg_name[0], pkg_name)
    logger.info(dict(msg='fetching archive listing', pkg=pkg_name, url=url))
    return net_t.fetch_text(url)
def sync_reference(
    self,
    reference: dict[str, str],
    cache_dir: pathlib.Path,
    cache_db: cache_db_t,
    repos: Optional[list[str]] = None,
    arch: str = 'x86_64',
) -> None:
    """Ensure every (name, version) pair in reference is present in the cache.

    For each missing version, fetches the package's archive listing page,
    records the archive date of the wanted version, syncs each discovered
    date directory, then marks versions that became available as synced.
    Failures on individual packages or dates are logged and skipped
    (best-effort), never aborting the whole run.
    """
    if len(reference) == 0:
        return
    # find which (name, version) pairs are missing from cached packages
    missing: dict[str, str] = {}
    for name, version in reference.items():
        if not cache_db.has_package_version(name, version):
            missing[name] = version
    if len(missing) == 0:
        logger.info(dict(msg='all reference versions already cached', count=len(reference)))
        return
    logger.info(dict(msg='reference versions missing from cache', count=len(missing)))
    # group by package name to fetch each archive page once
    pkg_names = sorted(set(missing.keys()))
    dates_to_sync: set[str] = set()
    for pkg_name in pkg_names:
        try:
            html = self._fetch_archive_page(pkg_name)
        except Exception:
            # a single unreachable listing should not abort the sync
            logger.warning(
                dict(msg='failed to fetch archive listing', pkg=pkg_name),
                exc_info=True,
            )
            continue
        entries = pacman_manager_t.parse_archive_listing(pkg_name, html)
        if len(entries) > 0:
            # cache every version seen, not just the one we need
            cache_db.bulk_upsert_archive_versions(entries)
        target_version = missing[pkg_name]
        matched = [e for e in entries if e.version == target_version]
        if len(matched) == 0:
            logger.warning(dict(
                msg='version not found in archive listing',
                pkg=pkg_name,
                version=target_version,
            ))
            continue
        entry = matched[0]
        date_str = pacman_manager_t._format_date(entry.date)
        dates_to_sync.add(date_str)
        logger.info(dict(
            msg='found version in archive',
            pkg=pkg_name,
            version=target_version,
            archive_date=date_str,
        ))
    # sync each discovered date
    for date_str in sorted(dates_to_sync):
        try:
            self.sync_date(
                date=date_str,
                cache_dir=cache_dir,
                cache_db=cache_db,
                repos=repos,
                arch=arch,
            )
        except Exception:
            logger.warning(
                dict(msg='failed to sync date', date=date_str),
                exc_info=True,
            )
            continue
    # mark synced versions
    for name, version in missing.items():
        if cache_db.has_package_version(name, version):
            cache_db.mark_archive_version_synced(name, version)
        else:
            logger.warning(dict(
                msg='version still not found after sync',
                pkg=name,
                version=version,
            ))
@staticmethod
def parse_archive_listing(pkg_name: str, html: str) -> list[archive_entry_t]:
    """Parse an archive directory listing page into archive entries.

    Scans *html* with the class-level listing regex and keeps only the
    package tarballs that belong to *pkg_name*; detached signature files
    and files for other packages are skipped.

    :param pkg_name: exact package name the listing was fetched for
    :param html: raw HTML of the archive directory listing page
    :return: one ``archive_entry_t`` per matching package file
    """
    consts = pacman_manager_t.constants_t
    # Anchored filename pattern for this package, e.g.
    # "<name>-<version>-x86_64.pkg.tar.zst"; the version is group 1.
    filename_re = re.compile(
        rf'^{re.escape(pkg_name)}-(.+)-(x86_64|any)\.pkg\.tar\.(zst|xz)$'
    )
    found: list[archive_entry_t] = []
    for row in consts.listing_re.finditer(html):
        fname = row.group(1)
        # Detached .sig files appear alongside packages; ignore them.
        if fname.endswith('.sig'):
            continue
        version_match = filename_re.match(fname)
        if version_match is None:
            continue
        # NOTE(review): an unrecognized month abbreviation silently maps
        # to January (month_map default 1) — confirm this is intended.
        listing_date = datetime.date(
            int(row.group(4)),
            consts.month_map.get(row.group(3), 1),
            int(row.group(2)),
        )
        found.append(archive_entry_t(
            name=pkg_name,
            version=version_match.group(1),
            filename=fname,
            date=listing_date,
        ))
    return found

@ -0,0 +1,172 @@
import argparse
import io
import unittest
import unittest.mock

from ..apps.network.base import net_t
from ..apps.network.cli import net_cli_t
from ..apps.network.settings import net_settings_t
class TestNetSettings(unittest.TestCase):
    """Lifecycle of the net_settings_t singleton: defaults, reset, env."""

    def tearDown(self) -> None:
        # Drop the cached singleton so each test starts fresh.
        net_settings_t._instance = None

    def test_singleton_returns_same_instance(self) -> None:
        first = net_settings_t.singleton()
        second = net_settings_t.singleton()
        self.assertIs(first, second)

    def test_default_timeout(self) -> None:
        settings = net_settings_t.singleton()
        self.assertEqual(settings.timeout, 8.0)

    def test_default_max_size(self) -> None:
        settings = net_settings_t.singleton()
        self.assertEqual(settings.max_size, 50 * 1024 * 1024)

    def test_reset_changes_timeout(self) -> None:
        net_settings_t.reset(timeout=4.0)
        self.assertEqual(net_settings_t.singleton().timeout, 4.0)

    def test_reset_changes_max_size(self) -> None:
        net_settings_t.reset(max_size=1024)
        self.assertEqual(net_settings_t.singleton().max_size, 1024)

    def test_reset_returns_new_instance(self) -> None:
        before = net_settings_t.singleton()
        after = net_settings_t.reset(timeout=2.0)
        self.assertIsNot(before, after)
        self.assertEqual(after.timeout, 2.0)

    def test_reset_partial_preserves_defaults(self) -> None:
        # Only timeout is overridden; max_size must keep its default.
        settings = net_settings_t.reset(timeout=3.0)
        self.assertEqual(settings.timeout, 3.0)
        self.assertEqual(settings.max_size, 50 * 1024 * 1024)

    def test_env_override(self) -> None:
        net_settings_t._instance = None
        env = {'ARCHLINUX_NET_TIMEOUT': '2.5'}
        with unittest.mock.patch.dict('os.environ', env):
            self.assertEqual(net_settings_t().timeout, 2.5)
class TestNetReadLimited(unittest.TestCase):
    """Size-limit enforcement in net_t._read_limited.

    The response object is mocked with a BytesIO-backed ``read`` so the
    limiter's chunked-read loop is exercised without any real I/O.
    """

    def tearDown(self) -> None:
        # Drop the cached settings singleton so tests stay independent.
        net_settings_t._instance = None

    def test_read_within_limit(self) -> None:
        # Data strictly below the limit is returned intact.
        # (The previous version wrapped the call in a no-op
        # patch.object(..., wraps=...) context and created unused
        # BytesIO/HTTPResponse locals; all of that dead code is gone.)
        net_settings_t.reset(max_size=1024)
        data = b'x' * 512
        mock_resp = unittest.mock.MagicMock()
        mock_resp.read = io.BytesIO(data).read
        result = net_t._read_limited(mock_resp, max_size=1024)
        self.assertEqual(result, data)

    def test_read_exceeds_limit(self) -> None:
        # Data above the limit raises ValueError mentioning max_size.
        net_settings_t.reset(max_size=100)
        data = b'x' * 200
        mock_resp = unittest.mock.MagicMock()
        mock_resp.read = io.BytesIO(data).read
        with self.assertRaises(ValueError) as ctx:
            net_t._read_limited(mock_resp, max_size=100)
        self.assertIn('max_size', str(ctx.exception))
class TestNetFetchUrl(unittest.TestCase):
    """Timeout plumbing of fetch_url/fetch_text via a mocked urlopen."""

    @staticmethod
    def _mock_response(chunks):
        # Build a response mock whose read() yields the given chunks,
        # ending with b'' to terminate the caller's read loop.
        resp = unittest.mock.MagicMock()
        resp.read.side_effect = chunks
        return resp

    def tearDown(self) -> None:
        # Drop the cached settings singleton between tests.
        net_settings_t._instance = None

    def test_fetch_url_uses_settings_timeout(self) -> None:
        net_settings_t.reset(timeout=4.5)
        with unittest.mock.patch('urllib.request.urlopen') as urlopen_mock:
            urlopen_mock.return_value = self._mock_response([b'hello', b''])
            body = net_t.fetch_url('http://example.com')
            self.assertEqual(body, b'hello')
            urlopen_mock.assert_called_once_with('http://example.com', timeout=4.5)

    def test_fetch_url_explicit_timeout_overrides(self) -> None:
        net_settings_t.reset(timeout=8.0)
        with unittest.mock.patch('urllib.request.urlopen') as urlopen_mock:
            urlopen_mock.return_value = self._mock_response([b'data', b''])
            net_t.fetch_url('http://example.com', timeout=2.0)
            urlopen_mock.assert_called_once_with('http://example.com', timeout=2.0)

    def test_fetch_text_returns_str(self) -> None:
        net_settings_t.reset(timeout=8.0)
        with unittest.mock.patch('urllib.request.urlopen') as urlopen_mock:
            urlopen_mock.return_value = self._mock_response([b'hello world', b''])
            text = net_t.fetch_text('http://example.com')
            self.assertIsInstance(text, str)
            self.assertEqual(text, 'hello world')
class TestNetCli(unittest.TestCase):
    """CLI argument plumbing: add_arguments, extract, apply."""

    def tearDown(self) -> None:
        # Drop the cached settings singleton between tests.
        net_settings_t._instance = None

    def _parser(self) -> argparse.ArgumentParser:
        # Fresh parser with the network arguments registered.
        parser = argparse.ArgumentParser()
        net_cli_t.add_arguments(parser)
        return parser

    def test_add_arguments(self) -> None:
        ns = self._parser().parse_args(
            ['--net-timeout', '4.5', '--net-max-size', '1024'])
        self.assertEqual(ns.net_timeout, 4.5)
        self.assertEqual(ns.net_max_size, 1024)

    def test_add_arguments_defaults(self) -> None:
        # Unspecified flags default to None so extract() can skip them.
        ns = self._parser().parse_args([])
        self.assertIsNone(ns.net_timeout)
        self.assertIsNone(ns.net_max_size)

    def test_extract_both(self) -> None:
        ns = argparse.Namespace(net_timeout=3.0, net_max_size=2048)
        self.assertEqual(
            net_cli_t.extract(ns), {'timeout': 3.0, 'max_size': 2048})

    def test_extract_partial(self) -> None:
        ns = argparse.Namespace(net_timeout=5.0, net_max_size=None)
        self.assertEqual(net_cli_t.extract(ns), {'timeout': 5.0})

    def test_extract_empty(self) -> None:
        ns = argparse.Namespace(net_timeout=None, net_max_size=None)
        self.assertEqual(net_cli_t.extract(ns), {})

    def test_apply_with_overrides(self) -> None:
        applied = net_cli_t.apply({'timeout': 2.0, 'max_size': 512})
        self.assertEqual(applied.timeout, 2.0)
        self.assertEqual(applied.max_size, 512)
        # apply() must install the instance as the process-wide singleton.
        self.assertIs(applied, net_settings_t.singleton())

    def test_apply_empty_returns_default(self) -> None:
        applied = net_cli_t.apply({})
        self.assertEqual(applied.timeout, 8.0)
        self.assertEqual(applied.max_size, 50 * 1024 * 1024)