From 1980bbf8c003cb52bab0b1400aadeb031a4432c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eric=20M=C3=BCller?= <mueller@kip.uni-heidelberg.de>
Date: Mon, 13 Jan 2025 15:19:03 +0100
Subject: [PATCH] feat: add OCI cache helper tools

---
 fetch_cached_buildresults.py    | 129 ++++++++++++++++++++++++++++++++
 fetch_cached_sources.py         | 114 ++++++++++++++++++++++++++++
 specfile_dag_hash.py            |  44 +++++++++++
 specfile_storage_path_build.py  |  51 ++++++++++++
 specfile_storage_path_source.py |  66 ++++++++++++++++
 update_cached_buildresults.py   |  82 +++++++++++++++++++++
 update_cached_sources.py        |  73 ++++++++++++++++++
 7 files changed, 559 insertions(+)
 create mode 100644 fetch_cached_buildresults.py
 create mode 100644 fetch_cached_sources.py
 create mode 100644 specfile_dag_hash.py
 create mode 100644 specfile_storage_path_build.py
 create mode 100644 specfile_storage_path_source.py
 create mode 100644 update_cached_buildresults.py
 create mode 100644 update_cached_sources.py

diff --git a/fetch_cached_buildresults.py b/fetch_cached_buildresults.py
new file mode 100644
index 00000000..99363fec
--- /dev/null
+++ b/fetch_cached_buildresults.py
@@ -0,0 +1,129 @@
+import argparse
+import os
+import pathlib
+import subprocess
+
+parser = argparse.ArgumentParser(
+        prog='fetch_cached_buildresults.py',
+        description='Downloading missing build results to a local spack cache.',
+        epilog='...')
+
+parser.add_argument(
+    "path_missing", type=pathlib.Path,
+    help="Location of the output file that will list the packages not yet in the build cache.")
+
+parser.add_argument(
+    "specfiles", nargs="+",
+    help="Locations of the files containing the specs that should be available.")
+
+have_harbor = "HARBOR_HOST" in os.environ and "HARBOR_PROJECT" in os.environ
+parser.add_argument(
+    "--remote-cache", type=str, required=not have_harbor,
+    default="{}/{}/build_cache".format(
+        os.environ["HARBOR_HOST"], os.environ["HARBOR_PROJECT"]) if have_harbor else "",
+    help="Path or URL to remote cache (target).")
+
+parser.add_argument(
+    "--remote-cache-type", type=str, choices=["oci"],
+    default="oci",
+    help="Type of the remote cache.")
+
+parser.add_argument(
+    "--remote-cache-username", type=str,
+    default=os.environ.get("HARBOR_USERNAME", ""),
+    help="Username for remote cache (if applicable).")
+
+parser.add_argument(
+    "--remote-cache-password", type=str,
+    default=os.environ.get("HARBOR_PASSWORD"),
+    help="Password for remote cache (if applicable).")
+
+parser.add_argument(
+    "--local-cache", type=str,
+    default=os.environ.get(
+        "YASHCHIKI_CACHE_BUILD", os.path.expanduser("~/.yashchiki/cache/")),
+    help="Path to local spack cache folder (build results).")
+
+parser.add_argument(
+    "--yashchiki-home", type=str, required=True,
+    help="Path to yashchiki home for calling helper tools.")
+
+parser.add_argument(
+    "--include-installed",
+    action='store_true', default=False,
+    help="Include already installed specs.")
+
+args = parser.parse_args()
+
+local_cache = pathlib.Path(args.local_cache)
+if not local_cache.exists():
+    print("Creating local build cache directory")
+    local_cache.mkdir(parents=True, exist_ok=True)
+
+missing_packages = []
+available_packages = []
+cached_paths = []
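+# Each cache entry is stored as an OCI artifact tagged with the file's
+# basename, so a single `oras repo tags` call enumerates the remote cache.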
+cmd = ["oras", "repo", "tags"]
+if args.remote_cache_username and args.remote_cache_password:
+    cmd.extend(["--username", args.remote_cache_username])
+    cmd.extend(["--password", args.remote_cache_password])
+cmd.append(args.remote_cache)
+try:
+    tags = subprocess.check_output(cmd)
+    tags = tags.decode("utf-8")
+    cached_paths = tags.split()
+except subprocess.CalledProcessError as e:
+    print(f"Listing repo tags of \"{args.remote_cache}\" failed:", str(e))
+
+for specfile in args.specfiles:
+    with open(specfile, "r") as fd:
+        packages = {}
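+        # packages maps each spec's DAG hash to the cache-relative paths
+        # (binary tarball and spec metadata) expected in the build cache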
+        try:
+            include_installed = " --include-installed" if args.include_installed else ""
+            # FIXME: import and call the function directly; that would require running this script under spack-python already
+            lines = subprocess.check_output(f"spack-python {args.yashchiki_home}/specfile_storage_path_build.py {specfile}{include_installed}", shell=True)
+            lines = lines.decode("utf-8")
+            lines = lines.split("\n")
+            for line in lines:
+                if not line:
+                    continue
+                elems = line.split()
+                packages[elems[0]] = elems[1:]
+        except subprocess.CalledProcessError as e:
+            print("Computing fetch buildresult paths failed:", str(e), e.output)
+        for package_dag_hash, fetch_paths in packages.items():
+            missing_paths = []
+            for fetch_path in fetch_paths:
+                basename = os.path.basename(fetch_path)
+                if basename in cached_paths:
+                    cmd = ["oras", "pull"]
+                    if args.remote_cache_username and args.remote_cache_password:
+                        cmd.extend(["--username", args.remote_cache_username])
+                        cmd.extend(["--password", args.remote_cache_password])
+                    cmd.append(args.remote_cache + f":{basename}")
+                    try:
+                        subprocess.check_output(cmd, stderr=subprocess.STDOUT, cwd=local_cache)
+                    except subprocess.CalledProcessError as e:
+                        print(f"Pulling of \"{basename}\" from \"{args.remote_cache}\" failed.")
+                        missing_paths.append(fetch_path)
+                else:
+                    missing_paths.append(fetch_path)
+            # a package counts as missing if its tarball or its spec metadata is absent
+            package_missing = any(
+                missing_path.endswith((".spack", ".spec.json"))
+                for missing_path in missing_paths)
+            if package_missing:
+                missing_packages.append(f"{package_dag_hash} " + " ".join(missing_paths))
+            else:
+                available_packages.append(f"{package_dag_hash} " + " ".join(missing_paths))
+
+print(len(missing_packages), "missing packages in remote buildresults cache.")
+print(len(available_packages), "available packages in remote buildresults cache.")
+
+if missing_packages:
+    with open(args.path_missing, "w") as fd:
+        fd.write("\n".join(missing_packages))
diff --git a/fetch_cached_sources.py b/fetch_cached_sources.py
new file mode 100644
index 00000000..6aace8b0
--- /dev/null
+++ b/fetch_cached_sources.py
@@ -0,0 +1,114 @@
+import argparse
+import os
+import pathlib
+import subprocess
+
+parser = argparse.ArgumentParser(
+        prog='fetch_cached_sources.py',
+        description='Downloading missing source files to a spack cache.',
+        epilog='...')
+
+parser.add_argument(
+    "path_missing", type=pathlib.Path,
+    help="Location of the output file that will list the packages not yet in the source cache.")
+
+parser.add_argument(
+    "specfiles", nargs="+",
+    help="Locations of the files containing the specs that should be available.")
+
+have_harbor = "HARBOR_HOST" in os.environ and "HARBOR_PROJECT" in os.environ
+parser.add_argument(
+    "--remote-cache", type=str, required=not have_harbor,
+    default="{}/{}/source_cache".format(
+        os.environ["HARBOR_HOST"], os.environ["HARBOR_PROJECT"]) if have_harbor else "",
+    help="Path or URL to remote cache (target).")
+
+parser.add_argument(
+    "--remote-cache-type", type=str, choices=["oci"],
+    default="oci",
+    help="Type of the remote cache.")
+
+parser.add_argument(
+    "--remote-cache-username", type=str,
+    default=os.environ.get("HARBOR_USERNAME", ""),
+    help="Username for remote cache (if applicable).")
+
+parser.add_argument(
+    "--remote-cache-password", type=str,
+    default=os.environ.get("HARBOR_PASSWORD"),
+    help="Password for remote cache (if applicable).")
+
+parser.add_argument(
+    "--local-cache", type=str,
+    default=os.environ.get(
+        "YASHCHIKI_CACHE_SOURCE", os.path.expanduser("~/.yashchiki/cache/")),
+    help="Path to local spack cache folder (source).")
+
+parser.add_argument(
+    "--yashchiki-home", type=str, required=True,
+    help="Path to yashchiki home for calling helper tools.")
+
+parser.add_argument(
+    "--include-installed",
+    action='store_true', default=False,
+    help="Include already installed specs.")
+
+args = parser.parse_args()
+
+local_cache = pathlib.Path(args.local_cache)
+if not local_cache.exists():
+    print("Creating local source cache directory")
+    local_cache.mkdir(parents=True, exist_ok=True)
+
+missing_paths = []
+available_paths = []
+cached_paths = []
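+# Cache entries are OCI artifacts tagged with the file's basename; listing
+# the repo tags therefore enumerates everything in the remote cache.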
+cmd = ["oras", "repo", "tags"]
+if args.remote_cache_username and args.remote_cache_password:
+    cmd.extend(["--username", args.remote_cache_username])
+    cmd.extend(["--password", args.remote_cache_password])
+cmd.append(args.remote_cache)
+try:
+    tags = subprocess.check_output(cmd)
+    tags = tags.decode("utf-8")
+    cached_paths = tags.split()
+except subprocess.CalledProcessError as e:
+    print(f"Listing repo tags of \"{args.remote_cache}\" failed:", str(e))
+
+for specfile in args.specfiles:
+    with open(specfile, "r") as fd:
+        try:
+            include_installed = " --include-installed" if args.include_installed else ""
+            # FIXME: import and call the function directly; that would require running this script under spack-python already
+            paths = subprocess.check_output(f"spack-python {args.yashchiki_home}/specfile_storage_path_source.py {specfile}{include_installed}", shell=True)
+            paths = paths.decode("utf-8")
+            fetch_paths = paths.split()
+        except subprocess.CalledProcessError as e:
+            print(f"Computing fetch storage paths failed for {specfile}.")
+            continue
+        for fetch_path in fetch_paths:
+            basename = os.path.basename(fetch_path)
+            if basename in cached_paths:
+                cmd = ["oras", "pull"]
+                if args.remote_cache_username and args.remote_cache_password:
+                    cmd.extend(["--username", args.remote_cache_username])
+                    cmd.extend(["--password", args.remote_cache_password])
+                cmd.append(args.remote_cache + f":{basename}")
+                try:
+                    subprocess.check_output(cmd, stderr=subprocess.STDOUT, cwd=local_cache)
+                except subprocess.CalledProcessError as e:
+                    print(f"Pulling of \"{basename}\" from \"{args.remote_cache}\" failed.")
+                    missing_paths.append(fetch_path)
+                else:
+                    available_paths.append(fetch_path)
+            else:
+                missing_paths.append(fetch_path)
+
+print(len(missing_paths), "missing files in remote source cache.")
+print(len(available_paths), "available files in remote source cache.")
+
+if missing_paths:
+    with open(args.path_missing, "w") as fd:
+        fd.write("\n".join(missing_paths))
diff --git a/specfile_dag_hash.py b/specfile_dag_hash.py
new file mode 100644
index 00000000..6e001b84
--- /dev/null
+++ b/specfile_dag_hash.py
@@ -0,0 +1,44 @@
+import argparse
+from collections.abc import Iterable
+import pathlib
+import ruamel.yaml as yaml
+import spack.spec
+import spack.traverse
+
+parser = argparse.ArgumentParser(
+        prog='specfile_dag_hash.py',
+        description='Extracting DAG hashes from a given specfile',
+        epilog='...')
+
+parser.add_argument(
+    "path_specfile", type=pathlib.Path,
+    help="Location of the specfile to parse")
+
+parser.add_argument(
+    "--include-installed",
+    action='store_true', default=False,
+    help="Include already installed specs.")
+
+args = parser.parse_args()
+
+with open(args.path_specfile, "r") as fd:
+    file_content = fd.read()
+    data = list(yaml.safe_load_all(file_content))
+
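+# Walk the full dependency DAG of every root spec in the specfile; the set
+# deduplicates hashes shared between roots.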
+to_be_fetched = set()
+for rspec in data:
+    s = spack.spec.Spec.from_dict(rspec)
+    if not isinstance(s, Iterable):
+        s = [s]
+
+    maybe_to_be_fetched = spack.traverse.traverse_nodes(s, key=spack.traverse.by_dag_hash)
+
+    for spec in maybe_to_be_fetched:
+        if (not args.include_installed) and spec.installed:
+            continue
+        to_be_fetched.add(spec.dag_hash())
+
+for dag_hash in to_be_fetched:
+    print(dag_hash)
diff --git a/specfile_storage_path_build.py b/specfile_storage_path_build.py
new file mode 100644
index 00000000..15cb90f8
--- /dev/null
+++ b/specfile_storage_path_build.py
@@ -0,0 +1,51 @@
+import argparse
+from collections.abc import Iterable
+import pathlib
+import ruamel.yaml as yaml
+import spack.binary_distribution as bindist
+import spack.spec
+import spack.traverse
+
+parser = argparse.ArgumentParser(
+        prog='specfile_storage_path_build.py',
+        description='Extracting storage paths to the build cache from a given specfile',
+        epilog='...')
+
+parser.add_argument(
+    "path_specfile", type=pathlib.Path,
+    help="Location of the specfile to parse")
+
+parser.add_argument(
+    "--include-installed",
+    action='store_true', default=False,
+    help="Include already installed specs.")
+
+args = parser.parse_args()
+
+with open(args.path_specfile, "r") as fd:
+    file_content = fd.read()
+    data = list(yaml.safe_load_all(file_content))
+
+to_be_fetched = set()
+for rspec in data:
+    s = spack.spec.Spec.from_dict(rspec)
+    if not isinstance(s, Iterable):
+        s = [s]
+
+    maybe_to_be_fetched = spack.traverse.traverse_nodes(s, key=spack.traverse.by_dag_hash)
+
+    for spec in maybe_to_be_fetched:
+        if (not args.include_installed) and spec.installed:
+            continue
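+        # candidate artifacts per spec: the binary tarball and the spec
+        # metadata in its signed, unsigned and legacy YAML variants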
+        build_cache_paths = [
+            bindist.tarball_path_name(spec, ".spack"),
+            bindist.tarball_name(spec, ".spec.json.sig"),
+            bindist.tarball_name(spec, ".spec.json"),
+            bindist.tarball_name(spec, ".spec.yaml"),
+        ]
+        to_be_fetched.add(spec.dag_hash() + " " + " ".join(build_cache_paths))
+
+for elem in to_be_fetched:
+    print(elem)
diff --git a/specfile_storage_path_source.py b/specfile_storage_path_source.py
new file mode 100644
index 00000000..6e8a8889
--- /dev/null
+++ b/specfile_storage_path_source.py
@@ -0,0 +1,66 @@
+import argparse
+from collections.abc import Iterable
+import llnl.util.filesystem as fsys
+import os
+import pathlib
+import ruamel.yaml as yaml
+import spack.mirror
+import spack.patch
+import spack.spec
+import spack.traverse
+
+parser = argparse.ArgumentParser(
+        prog='specfile_storage_path_source.py',
+        description='Extracting storage paths to the source cache from a given specfile',
+        epilog='...')
+
+parser.add_argument(
+    "path_specfile", type=pathlib.Path,
+    help="Location of the specfile to parse")
+
+parser.add_argument(
+    "--include-installed",
+    action='store_true', default=False,
+    help="Include already installed specs.")
+
+args = parser.parse_args()
+
+with open(args.path_specfile, "r") as fd:
+    file_content = fd.read()
+    data = list(yaml.safe_load_all(file_content))
+
+to_be_fetched = set()
+for rspec in data:
+    s = spack.spec.Spec.from_dict(rspec)
+    if not isinstance(s, Iterable):
+        s = [s]
+
+    maybe_to_be_fetched = spack.traverse.traverse_nodes(s, key=spack.traverse.by_dag_hash)
+
+    for ss in maybe_to_be_fetched:
+        if (not args.include_installed) and ss.installed:
+            continue
+
+        pkg = ss.package
+
+        # Some packages are not cachable (e.g. branch-name-only versions, or BundlePackages)
+        if not pkg.fetcher.cachable:
+            continue
+
+        # TODO: pkg.fetcher.mirror_id() might be almost sufficient…
+
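+        # sources are cached under the cosmetic <name>/<name>-<version>
+        # path, matching spack's mirror layout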
+        format_string = "{name}-{version}"
+        pretty_name = pkg.spec.format_path(format_string)
+        cosmetic_path = os.path.join(pkg.name, pretty_name)
+        to_be_fetched.add(str(spack.mirror.mirror_archive_paths(pkg.fetcher, cosmetic_path).storage_path))
+        for resource in pkg._get_needed_resources():
+            pretty_resource_name = fsys.polite_filename(f"{resource.name}-{pkg.version}")
+            to_be_fetched.add(str(spack.mirror.mirror_archive_paths(resource.fetcher, pretty_resource_name).storage_path))
+        for patch in ss.patches:
+            if isinstance(patch, spack.patch.UrlPatch):
+                to_be_fetched.add(str(spack.mirror.mirror_archive_paths(patch.stage.fetcher, patch.stage.name).storage_path))
+
+for elem in to_be_fetched:
+    print(elem)
diff --git a/update_cached_buildresults.py b/update_cached_buildresults.py
new file mode 100644
index 00000000..caacf86e
--- /dev/null
+++ b/update_cached_buildresults.py
@@ -0,0 +1,82 @@
+import argparse
+import os
+import pathlib
+import subprocess
+import sys
+
+parser = argparse.ArgumentParser(
+        prog='update_cached_buildresults.py',
+        description='Uploading previously missing build results to a cache.',
+        epilog='...')
+
+parser.add_argument(
+    "path_missing", type=pathlib.Path,
+    help="Location of the file that lists the hashes and packages not yet in the build cache.")
+
+have_harbor = "HARBOR_HOST" in os.environ and "HARBOR_PROJECT" in os.environ
+parser.add_argument(
+    "--remote-cache", type=str, required=not have_harbor,
+    default="{}/{}/build_cache".format(
+        os.environ["HARBOR_HOST"], os.environ["HARBOR_PROJECT"]) if have_harbor else "",
+    help="Path or URL to remote cache (target).")
+
+parser.add_argument(
+    "--remote-cache-type", type=str, choices=["oci"],
+    default="oci",
+    help="Type of the remote cache.")
+
+parser.add_argument(
+    "--remote-cache-username", type=str, required="HARBOR_USERNAME" not in os.environ,
+    default=os.environ.get("HARBOR_USERNAME", ""),
+    help="Username for remote cache (if applicable).")
+
+parser.add_argument(
+    "--remote-cache-password", type=str, required="HARBOR_PASSWORD" not in os.environ,
+    default=os.environ.get("HARBOR_PASSWORD"),
+    help="Password for remote cache (if applicable).")
+
+parser.add_argument(
+    "--local-cache", type=str,
+    default=os.environ.get(
+        "YASHCHIKI_CACHE_BUILD", os.path.expanduser("~/.yashchiki/cache/")),
+    help="Path to local spack cache folder (build results).")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.path_missing):
+    print("File with missing cached build information is not available: {}".format(args.path_missing))
+    sys.exit(0)
+
+packages = {}
+with open(args.path_missing, "r") as fd:
+    lines = fd.readlines()
+    for line in lines:
+        elems = line.split()
+        packages[elems[0]] = elems[1:]
+
+    for package_dag_hash, paths in packages.items():
+        basenames = [os.path.basename(path) for path in paths]
+
+        for path, basename in zip(paths, basenames):
+            full_path = pathlib.Path(args.local_cache) / path
+
+            if str(full_path).endswith((".spack", ".spec.json")) and not full_path.exists():
+                print(f"Missing local cache entry for \"{full_path}\"")
+                continue
+
+            if not full_path.exists():
+                # we don't care about other file endings for now
+                continue
+
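+            # push as an OCI artifact tagged with the basename; the full
+            # cache-relative path is preserved in the `path` annotation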
+            cmd = ("oras", "push",
+                    "--username", args.remote_cache_username,
+                    "--password", args.remote_cache_password,
+                    f"--annotation=path={path}",
+                    f"{args.remote_cache}:{basename}",
+                    path)
+            try:
+                subprocess.check_output(cmd, cwd=args.local_cache)
+            except subprocess.CalledProcessError as e:
+                print(f"Uploading of \"{path}\" to \"{args.remote_cache}:{basename}\" failed.")
diff --git a/update_cached_sources.py b/update_cached_sources.py
new file mode 100644
index 00000000..872b05c1
--- /dev/null
+++ b/update_cached_sources.py
@@ -0,0 +1,73 @@
+import argparse
+import os
+import pathlib
+import subprocess
+import sys
+
+parser = argparse.ArgumentParser(
+        prog='update_cached_sources.py',
+        description='Uploading previously missing source files to a cache.',
+        epilog='...')
+
+parser.add_argument(
+    "path_missing", type=pathlib.Path,
+    help="Location of the file that lists the hashes and packages not yet in the source cache.")
+
+have_harbor = "HARBOR_HOST" in os.environ and "HARBOR_PROJECT" in os.environ
+parser.add_argument(
+    "--remote-cache", type=str, required=not have_harbor,
+    default="{}/{}/source_cache".format(
+        os.environ["HARBOR_HOST"], os.environ["HARBOR_PROJECT"]) if have_harbor else "",
+    help="Path or URL to remote cache (target).")
+
+parser.add_argument(
+    "--remote-cache-type", type=str, choices=["oci"],
+    default="oci",
+    help="Type of the remote cache.")
+
+parser.add_argument(
+    "--remote-cache-username", type=str, required="HARBOR_USERNAME" not in os.environ,
+    default=os.environ.get("HARBOR_USERNAME", ""),
+    help="Username for remote cache (if applicable).")
+
+parser.add_argument(
+    "--remote-cache-password", type=str, required="HARBOR_PASSWORD" not in os.environ,
+    default=os.environ.get("HARBOR_PASSWORD"),
+    help="Password for remote cache (if applicable).")
+
+parser.add_argument(
+    "--local-cache", type=str,
+    default=os.environ.get(
+        "YASHCHIKI_CACHE_SOURCE", os.path.expanduser("~/.yashchiki/cache/")),
+    help="Path to local spack cache folder (source).")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.path_missing):
+    print("File with missing cached source information is not available: {}".format(args.path_missing))
+    sys.exit(0)
+
+with open(args.path_missing, "r") as fd:
+    missing_file_paths = fd.readlines()
+
+    for path in missing_file_paths:
+        stripped_path = path.rstrip()
+        basename = os.path.basename(stripped_path)
+        full_path = pathlib.Path(args.local_cache) / stripped_path
+
+        if not full_path.exists():
+            print(f"Missing local cache entry for \"{full_path}\"")
+            continue
+
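+        # push as an OCI artifact tagged with the basename; the full
+        # cache-relative path is preserved in the `path` annotation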
+        cmd = ("oras", "push",
+                "--username", args.remote_cache_username,
+                "--password", args.remote_cache_password,
+                f"--annotation=path={stripped_path}",
+                f"{args.remote_cache}:{basename}",
+                stripped_path)
+        try:
+            subprocess.check_output(cmd, cwd=args.local_cache)
+        except subprocess.CalledProcessError as e:
+            print(f"Uploading of \"{stripped_path}\" to \"{args.remote_cache}:{basename}\" failed.")
-- 
GitLab