#!/usr/bin/env python3
"""Script to add CPython releases.
Checks the CPython download archives for new versions,
then writes a build script for any which do not exist locally,
saving it to plugins/python-build/share/python-build.
"""
import argparse
import dataclasses
import hashlib
import io
import itertools
import logging
import operator
import os.path
import pathlib
import pprint
import re
import subprocess
import sys
import typing
import urllib.parse
import jc
import more_itertools
import packaging.version
import requests
import requests_html
import sortedcontainers
import tqdm
logger = logging.getLogger(__name__)
CUTOFF_VERSION = packaging.version.Version('3.9')
EXCLUDED_VERSIONS = {
    packaging.version.Version("3.9.3"),  # recalled upstream
}
here = pathlib.Path(__file__).resolve()
OUT_DIR: pathlib.Path = here.parent.parent / "share" / "python-build"
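# Template for "<version>t" thunk scripts: enable the free-threaded build and
# delegate to the base version's script (the thunk's own path minus the trailing "t")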
T_THUNK = '''export PYTHON_BUILD_FREE_THREADING=1
source "${BASH_SOURCE[0]%t}"
'''
def adapt_script(version: packaging.version.Version,
previous_version: packaging.version.Version) -> typing.Union[pathlib.Path, None]:
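    """Rewrite the previous version's build script for `version`,
    substituting fresh CPython, OpenSSL, and readline package names,
    URLs, and checksums. Returns the new script's path, or None if the
    existing download URL cannot be matched to an available download.
    """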
previous_version_path = OUT_DIR.joinpath(str(previous_version))
with previous_version_path.open("r", encoding='utf-8') as f:
script = f.readlines()
result = io.StringIO()
for line in script:
if m:=re.match(r'\s*install_package\s+"(?P<package>Python-\S+)"\s+'
r'"(?P<url>\S+)"\s+.*\s+verify_py(?P<verify_py_suffix>\d+)\s+.*$',
line):
existing_url_path = urllib.parse.urlparse(m.group('url')).path
try:
matched_download = more_itertools.one(
item for item in VersionDirectory.available[version].downloads
if existing_url_path.endswith(item.extension))
except ValueError:
                logger.error(f"Cannot match the extension of existing URL path {existing_url_path} "
                             f"to any available download in {VersionDirectory.available[version].downloads}")
return
new_package_name, new_package_url = matched_download.package_name, matched_download.url
new_package_hash = Url.sha256_url(new_package_url, VersionDirectory.session)
verify_py_suffix = str(version.major)+str(version.minor)
line = Re.sub_groups(m,
package=new_package_name,
url=new_package_url+'#'+new_package_hash,
verify_py_suffix=verify_py_suffix)
elif m:=re.match(r'\s*install_package\s+"(?P<package>openssl-\S+)"\s+'
r'"(?P<url>\S+)"\s.*$',
line):
item = VersionDirectory.openssl.get_store_latest_release()
line = Re.sub_groups(m,
package=item.package_name,
url=item.url + '#' + item.hash)
elif m:=re.match(r'\s*install_package\s+"(?P<package>readline-\S+)"\s+'
r'"(?P<url>\S+)"\s.*$',
line):
item = VersionDirectory.readline.get_store_latest_release()
line = Re.sub_groups(m,
package=item.package_name,
url=item.url + '#' + item.hash)
result.write(line)
result_path = OUT_DIR.joinpath(str(version))
logger.info(f"Writing {result_path}")
result_path.write_text(result.getvalue(), encoding='utf-8')
result.close()
return result_path
def add_version(version: packaging.version.Version):
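    """Generate a build script for `version` from its closest existing
    predecessor; return True on success.
    """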
previous_version = VersionDirectory.existing.pick_previous_version(version).version
    is_prerelease_upgrade = (previous_version.major == version.major
                             and previous_version.minor == version.minor
                             and previous_version.micro == version.micro)
logger.info(f"Adding {version} based on {previous_version}"
+ (" (prerelease upgrade)" if is_prerelease_upgrade else ""))
VersionDirectory.available.get_store_available_source_downloads(version)
    new_path = adapt_script(version, previous_version)
if not new_path:
return False
    VersionDirectory.existing.append(_CPythonExistingScriptInfo(version, new_path.name))
cleanup_prerelease_upgrade(is_prerelease_upgrade, previous_version, version)
handle_t_thunks(version, previous_version, is_prerelease_upgrade)
print(version)
return True
def cleanup_prerelease_upgrade(
        is_prerelease_upgrade: bool,
        previous_version: packaging.version.Version,
        new_version: packaging.version.Version) -> None:
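    """After a prerelease upgrade, turn the plain file write into an explicit
    `git mv` of the previous script so Git reliably records the rename,
    then restore the newly generated contents on top.
    """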
if not is_prerelease_upgrade:
return
previous_version_filename = str(previous_version)
new_version_filename = str(new_version)
new_version_path = OUT_DIR / new_version_filename
logger.info(f'Git moving {previous_version_filename} '
f'to {new_version_filename} (preserving new data)')
    data = new_version_path.read_text(encoding='utf-8')
    new_version_path.unlink()
    subprocess.check_call(("git", "-C", OUT_DIR,
                           "mv",
                           previous_version_filename,
                           new_version_filename))
    new_version_path.write_text(data, encoding='utf-8')
del VersionDirectory.existing[previous_version]
def handle_t_thunks(version, previous_version, is_prerelease_upgrade):
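    """Write the free-threading ("t") thunk for a 3.13+ version,
    renaming or replacing the previous version's thunk as appropriate.
    """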
if (version.major, version.minor) < (3, 13):
return
# an old thunk may have older version-specific code
# so it's safer to write a known version-independent template
    thunk_name = str(version) + "t"
thunk_path = OUT_DIR / thunk_name
previous_thunk_name = str(previous_version) + "t"
previous_thunk_path = OUT_DIR / previous_thunk_name
    if is_prerelease_upgrade:
        logger.info(f"Git moving {previous_thunk_name} to {thunk_name}")
        subprocess.check_call(("git", "-C", OUT_DIR,
                               "mv",
                               previous_thunk_name,
                               thunk_name))
    else:
        logger.info(f"Deleting {previous_thunk_path}")
        # the previous version may predate free-threading and have no thunk
        previous_thunk_path.unlink(missing_ok=True)
    logger.info(f"Writing {thunk_path}")
    thunk_path.write_text(T_THUNK, encoding='utf-8')
Arguments: argparse.Namespace
def main():
global Arguments
Arguments = parse_args()
logging.basicConfig(level=logging.DEBUG if Arguments.verbose else logging.INFO)
    cached_session = requests_html.HTMLSession()
global VersionDirectory
VersionDirectory = _VersionDirectory(cached_session)
VersionDirectory.existing.populate()
VersionDirectory.available.populate()
for initial_release in (v for v in frozenset(VersionDirectory.available.keys())
if v.micro == 0 and v not in VersionDirectory.existing):
# may actually be a prerelease
VersionDirectory.available.get_store_available_source_downloads(initial_release, True)
del initial_release
versions_to_add = sorted(VersionDirectory.available.keys() - VersionDirectory.existing.keys())
logger.info("Versions to add:\n"+pprint.pformat(versions_to_add))
result = False
for version_to_add in versions_to_add:
result = add_version(version_to_add) or result
return int(not result)
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"-d", "--dry-run", action="store_true",
help="Do not write scripts, just report them to stdout",
)
parser.add_argument(
"-v", "--verbose", action="store_true", default=0,
help="Increase verbosity of logging",
)
parsed = parser.parse_args()
return parsed
T = typing.TypeVar('T', bound=object)
K = typing.TypeVar('K', bound=typing.Hashable)
class KeyedList(typing.List[T], typing.Mapping[K, T]):
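    """A list whose items are also retrievable by a key attribute.

    Subclasses set `key_field` to name the attribute that serves as the key,
    and may set `item_init` to enable `get_or_create`. `append` and iteration
    behave like a list; `[]`, `in`, and `keys()` behave like a mapping.
    """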
key_field: str
    item_init: typing.Optional[typing.Callable[..., T]] = None
def __init__(self, seq: typing.Union[typing.Iterable[T], None] = None):
super().__init__()
self._map = {}
if seq is not None:
self.__iadd__(seq)
# read
def __getitem__(self, key: K) -> T:
return self._map[key]
def __contains__(self, key: K):
return key in self._map
def keys(self) -> typing.AbstractSet[K]:
return self._map.keys()
# write
def append(self, item: T) -> None:
key = self._getkey(item)
if key in self:
raise ValueError(f"Key '{key:r}' already present")
super().append(item)
self._map[key] = item
def __iadd__(self, other: typing.Iterable[T]):
for item in other:
self.append(item)
return self
def __delitem__(self, key: K):
super().remove(self[key])
del self._map[key]
def clear(self):
        super().__delitem__(slice(None, None))
self._map.clear()
# read-write
def get_or_create(self, key: K, **kwargs):
try:
return self[key]
except KeyError as e:
if self.item_init is None:
raise AttributeError("'item_init' must be set to use automatic item creation") from e
kwargs[self.key_field] = key
item = self.item_init(**kwargs)
self.append(item)
return item
# info
def __repr__(self):
return self.__class__.__name__ + "([" + ", ".join(repr(i) for i in self) + "])"
# private
def _getkey(self, item: T) -> K:
return getattr(item, self.key_field)
del T, K
@dataclasses.dataclass(frozen=True)
class _CPythonAvailableVersionDownloadInfo:
extension: str
package_name: str
url: str
class _CPythonAvailableVersionDownloadsDirectory(KeyedList[_CPythonAvailableVersionDownloadInfo, str]):
key_field = "extension"
@dataclasses.dataclass(frozen=True)
class _CPythonAvailableVersionInfo:
version: packaging.version.Version
download_page_url: str
    downloads: _CPythonAvailableVersionDownloadsDirectory = dataclasses.field(
        default_factory=_CPythonAvailableVersionDownloadsDirectory
    )
class CPythonAvailableVersionsDirectory(KeyedList[_CPythonAvailableVersionInfo, packaging.version.Version]):
key_field = "version"
_session: requests.Session
item_init = _CPythonAvailableVersionInfo
def __init__(self, session: requests.Session, seq=None):
super().__init__(seq)
self._session = session
def populate(self):
"""
Fetch remote versions
"""
logger.info("Fetching available CPython versions")
        for name, url in DownloadPage.enum_download_entries(
                "https://www.python.org/ftp/python/",
                r'^(\d+.*)/$', self._session,
                make_name=lambda m: m.group(1)):
v = packaging.version.Version(name)
if v < CUTOFF_VERSION or v in EXCLUDED_VERSIONS:
continue
logger.debug(f'Available version: {name} ({v}), {url}')
self.append(_CPythonAvailableVersionInfo(
v,
url
))
def get_store_available_source_downloads(self, version, refine_mode=False):
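        """Fetch and store the source tarball downloads for `version`.

        With `refine_mode` on, a version whose directory actually contains
        downloads for a different version (e.g. an x.y.0 directory that only
        holds prereleases so far) is replaced with the latest version found.
        """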
entry = self[version]
        if entry.downloads:
            # already retrieved
            return
        additional_versions_found = (
            CPythonAvailableVersionsDirectory(self._session) if refine_mode else None)
exact_download_found = False
for name, url in DownloadPage.enum_download_entries(
entry.download_page_url,
r'Python-.*\.(tar\.xz|tgz)$',
self._session):
m = re.match(r'(?P<package>Python-(?P<version>.*))\.(?P<extension>tar\.xz|tgz)$', name)
download_version = packaging.version.Version(m.group("version"))
if download_version != version:
if not refine_mode:
raise ValueError(f"Unexpectedly found a download {name} for {download_version} "
f"at page {entry.download_page_url} for {version}")
entry_to_fill = additional_versions_found.get_or_create(
download_version,
download_page_url=entry.download_page_url
)
else:
exact_download_found = True
entry_to_fill = entry
entry_to_fill.downloads.append(_CPythonAvailableVersionDownloadInfo(
m.group("extension"), m.group('package'), url
))
        if not exact_download_found:
            actual_version = max(additional_versions_found.keys())
            logger.debug(f"Refining available version {version} to {actual_version}")
            del self[version]
            self.append(additional_versions_found[actual_version])
class _CPythonExistingScriptInfo(typing.NamedTuple):
version: packaging.version.Version
filename: str
class CPythonExistingScriptsDirectory(KeyedList[_CPythonExistingScriptInfo, packaging.version.Version]):
key_field = "version"
    _filename_pattern = r'^\d+\.\d+(?:(t?)(-\w+)|(\.\d+((?:a|b|rc)\d+)?(t?)))$'
def populate(self):
"""
Enumerate existing installation scripts in share/python-build/ by pattern
"""
logger.info(f"Enumerating existing versions in {OUT_DIR}")
for entry_name in (p.name for p in OUT_DIR.iterdir() if p.is_file()):
if (not (m := re.match(self._filename_pattern, entry_name))
or m.group(1) == 't' or m.group(5) == 't'):
continue
try:
v = packaging.version.Version(entry_name)
if v < CUTOFF_VERSION:
continue
                # branch tip scripts differ from release scripts and are thus unusable as a pattern
if v.dev is not None:
continue
logger.debug(f"Existing version {v}")
self.append(_CPythonExistingScriptInfo(v, entry_name))
except ValueError as e:
logger.error(f"Unable to parse existing version {entry_name}: {e}")
def pick_previous_version(self,
version: packaging.version.Version) \
-> _CPythonExistingScriptInfo:
return max(v for v in self if v.version < version)
class _OpenSSLVersionInfo(typing.NamedTuple):
version: packaging.version.Version
package_name: str
url: str
hash: str
class OpenSSLVersionsDirectory(KeyedList[_OpenSSLVersionInfo, packaging.version.Version]):
key_field = "version"
def get_store_latest_release(self) \
-> _OpenSSLVersionInfo:
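        """Fetch and memoize the latest OpenSSL release via the GitHub API,
        resolving the tarball URL and its published SHA-256 checksum
        from the release assets.
        """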
        if self:
            # already retrieved
            return self[max(self.keys())]
j = requests.get("https://api.github.com/repos/openssl/openssl/releases/latest").json()
# noinspection PyTypeChecker
# urlparse can parse str as well as bytes
shasum_url = more_itertools.one(
asset['browser_download_url']
for asset in j['assets']
if urllib.parse.urlparse(asset['browser_download_url']).path.split('/')[-1].endswith('.sha256')
)
shasum_text = requests.get(shasum_url).text
shasum_data = jc.parse("hashsum", shasum_text, quiet=True)[0]
package_hash, package_filename = shasum_data["hash"], shasum_data["filename"]
del shasum_data, shasum_text, shasum_url
        # As of this writing, the OpenSSL GitHub repo tags releases as "openssl-<version>",
        # which is the package name format we need, but let's not rely on that.
        # os.path.splitext doesn't work with a chained extension (it only splits off
        # the last one), so extract the name and version from the filename instead.
        package_name, package_version_str = re.match(r"([^-]+-(.*?))\.\D", package_filename).groups()
package_version = packaging.version.Version(package_version_str)
package_url = more_itertools.one(
asset['browser_download_url']
for asset in j['assets']
if urllib.parse.urlparse(asset['browser_download_url']).path.split('/')[-1] == package_filename
)
result = _OpenSSLVersionInfo(package_version, package_name, package_url, package_hash)
self.append(result)
return result
class _ReadlineVersionInfo(typing.NamedTuple):
    version: packaging.version.Version
    package_name: str
    url: str
    hash: str
class ReadlineVersionsDirectory(KeyedList[_ReadlineVersionInfo, packaging.version.Version]):
key_field = "version"
def get_store_latest_release(self):
if not self:
self._store_latest_release()
return self._latest_release()
def _store_latest_release(self):
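        """Scrape the GNU mirror index for readline tarballs, pick the latest,
        compute its SHA-256, and store it under its ftpmirror.gnu.org
        load-balancer URL rather than the specific mirror it resolved to.
        """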
candidates = ReadlineVersionsDirectory()
        pattern = r'(?P<package_name>readline-(?P<version>\d+(?:\.\d+)+))\.tar\.gz$'
for name, url in DownloadPage.enum_download_entries(
'https://ftpmirror.gnu.org/readline/', pattern, VersionDirectory.session):
m = re.match(pattern, name)
version = packaging.version.Version(m.group('version'))
candidates.append(_ReadlineVersionInfo(
version,
m.group('package_name'),
url,
""
))
max_item = candidates._latest_release()
hash_ = Url.sha256_url(max_item.url, VersionDirectory.session)
        permalink = ('https://ftpmirror.gnu.org/readline/' +
                     os.path.basename(urllib.parse.urlparse(max_item.url).path))
result = _ReadlineVersionInfo(
max_item.version,
max_item.package_name,
permalink,
hash_)
self.append(result)
return result
def _latest_release(self):
return self[max(self.keys())]
class _VersionDirectory:
def __init__(self, session):
self.existing = CPythonExistingScriptsDirectory()
self.available = CPythonAvailableVersionsDirectory(session)
self.openssl = OpenSSLVersionsDirectory()
self.readline = ReadlineVersionsDirectory()
self.session = session
VersionDirectory: _VersionDirectory
class DownloadPage:
class _DownloadPageEntry(typing.NamedTuple):
name: str
url: str
@classmethod
    def enum_download_entries(cls, url, pattern, session=None,
                              make_name=lambda m: m.string) \
            -> typing.Generator[_DownloadPageEntry, None, None]:
        """
        Enumerate download entries in a standard Apache directory page
        (incl. the CPython download page https://www.python.org/ftp/python/)
        or a GNU mirror directory page
        (https://ftpmirror.gnu.org/<package>/ destinations).

        Yields entries whose names match `pattern`; `make_name` maps
        the match object to the yielded name.
        """
if session is None:
session = requests_html.HTMLSession()
response = session.get(url)
page = response.html
table = page.find("pre", first=True)
# some GNU mirrors format entries as a table
# (e.g. https://mirrors.ibiblio.org/gnu/readline/)
if table is None:
table = page.find("table", first=True)
links = table.find("a")
        for link in links:
            href = link.attrs['href']
            name = link.text  # CPython version entries are directories, so names may end in "/"
            # skip entries that don't match the pattern (e.g. parent-directory links)
            if not (m := re.match(pattern, name)):
                continue
            name = make_name(m)
            yield cls._DownloadPageEntry(name, urllib.parse.urljoin(response.url, href))
class Re:
@dataclasses.dataclass
class _interval:
group: typing.Union[int, str, None]
start: int
end: int
@staticmethod
    def sub_groups(match: re.Match,
                   /, *args: typing.Optional[str],
                   **kwargs: typing.Optional[str]) \
            -> str:
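        """Return `match.string` with selected groups replaced.

        Positional arguments replace numbered groups (group 1 onwards),
        keyword arguments replace named groups; a None replacement leaves
        the corresponding group unchanged. For example:

        >>> m = re.match('(?P<name>[a-z]+)-(?P<ver>[0-9]+)', 'pkg-1')
        >>> Re.sub_groups(m, ver='2')
        'pkg-2'
        """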
        repls = {i: repl for i, repl in enumerate(args, start=1) if repl is not None}
        repls.update({n: repl for n, repl in kwargs.items() if repl is not None})
        intervals: sortedcontainers.SortedKeyList = \
            sortedcontainers.SortedKeyList(key=operator.attrgetter("start", "end"))
        # iterate all groups: numbered (1..N) and named
        for group_id in itertools.chain(range(1, len(match.groups()) + 1), match.groupdict().keys()):
if group_id not in repls:
continue
if match.start(group_id) == -1:
continue
            intervals.add(Re._interval(group_id, match.start(group_id), match.end(group_id)))
del group_id
        last_interval = Re._interval(None, 0, 0)
        result = ""
for interval in intervals:
if interval.start < last_interval.end:
raise ValueError(f"Cannot replace intersecting matches "
f"for groups {last_interval.group} and {interval.group} "
f"(position {interval.start})")
if interval.end == interval.start and \
last_interval.start == last_interval.end == interval.start:
raise ValueError(f"Cannot replace consecutive zero-length matches "
f"for groups {last_interval.group} and {interval.group} "
f"(position {interval.start})")
            result += match.string[last_interval.end:interval.start] + repls[interval.group]
            last_interval = interval
        result += match.string[last_interval.end:]
        return result
class Url:
@staticmethod
def sha256_url(url, session=None):
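        """Stream-download `url`, showing a progress bar, and return the
        SHA-256 hex digest of its contents.
        """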
if session is None:
session = requests_html.HTMLSession()
logger.info(f"Downloading and computing hash of {url}")
h=hashlib.sha256()
r=session.get(url,stream=True)
total_bytes=int(r.headers.get('content-length',0)) or float('inf')
with tqdm.tqdm(total=total_bytes, unit='B', unit_scale=True, unit_divisor=1024) as t:
for c in r.iter_content(1024):
t.update(len(c))
h.update(c)
return h.hexdigest()
if __name__ == "__main__":
sys.exit(main())