lol

prefetch-npm-deps: fix reproducibility

v1 lockfiles can contain multiple references to the same version of a
package, and these references can contain different `integrity` values,
such as one having SHA-1 and SHA-512, while another just has SHA-512.

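Because HashMap iteration order is unspecified, which of these references
ends up being kept can differ between runs, so a different integrity value
could be chosen each time, breaking reproducibility. Fix this by also
sorting on integrity (strongest hash first) before deduplicating, so the
same reference is always the one that is kept.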

Thanks to @lilyinstarlight for originally discovering this issue, as well
as for the idea of the sorting-based implementation.

authored by winter.bsky.social and committed by

Lily Foster 7efebca8 d6b863fd

+151 -47
+1 -1
pkgs/build-support/node/fetch-npm-deps/src/main.rs
··· 105 105 eprintln!("{}", package.name); 106 106 107 107 let tarball = package.tarball()?; 108 - let integrity = package.integrity(); 108 + let integrity = package.integrity().map(ToString::to_string); 109 109 110 110 cache 111 111 .put(
+141 -7
pkgs/build-support/node/fetch-npm-deps/src/parse/lock.rs
··· 1 - use anyhow::{bail, Context}; 1 + use anyhow::{anyhow, bail, Context}; 2 2 use rayon::slice::ParallelSliceMut; 3 - use serde::Deserialize; 4 - use std::{collections::HashMap, fmt}; 3 + use serde::{ 4 + de::{self, Visitor}, 5 + Deserialize, Deserializer, 6 + }; 7 + use std::{ 8 + cmp::Ordering, 9 + collections::{HashMap, HashSet}, 10 + fmt, 11 + }; 5 12 use url::Url; 6 13 7 14 pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> { ··· 33 40 x.resolved 34 41 .partial_cmp(&y.resolved) 35 42 .expect("resolved should be comparable") 43 + .then( 44 + // v1 lockfiles can contain multiple references to the same version of a package, with 45 + // different integrity values (e.g. a SHA-1 and a SHA-512 in one, but just a SHA-512 in another) 46 + y.integrity 47 + .partial_cmp(&x.integrity) 48 + .expect("integrity should be comparable"), 49 + ) 36 50 }); 37 51 38 52 packages.dedup_by(|x, y| x.resolved == y.resolved); ··· 54 68 #[serde(default)] 55 69 bundled: bool, 56 70 resolved: Option<UrlOrString>, 57 - integrity: Option<String>, 71 + integrity: Option<HashCollection>, 58 72 dependencies: Option<HashMap<String, OldPackage>>, 59 73 } 60 74 ··· 63 77 #[serde(default)] 64 78 pub(super) name: Option<String>, 65 79 pub(super) resolved: Option<UrlOrString>, 66 - pub(super) integrity: Option<String>, 80 + pub(super) integrity: Option<HashCollection>, 67 81 } 68 82 69 83 #[derive(Debug, Deserialize, PartialEq, Eq, PartialOrd, Ord)] ··· 82 96 } 83 97 } 84 98 99 + #[derive(Debug, PartialEq, Eq)] 100 + pub(super) struct HashCollection(HashSet<Hash>); 101 + 102 + impl HashCollection { 103 + pub(super) fn into_best(self) -> Option<Hash> { 104 + self.0.into_iter().max() 105 + } 106 + } 107 + 108 + impl PartialOrd for HashCollection { 109 + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 110 + let lhs = self.0.iter().max()?; 111 + let rhs = other.0.iter().max()?; 112 + 113 + lhs.partial_cmp(rhs) 114 + } 115 + } 116 + 117 + impl<'de> Deserialize<'de> for 
HashCollection { 118 + fn deserialize<D>(deserializer: D) -> Result<HashCollection, D::Error> 119 + where 120 + D: Deserializer<'de>, 121 + { 122 + deserializer.deserialize_string(HashCollectionVisitor) 123 + } 124 + } 125 + 126 + struct HashCollectionVisitor; 127 + 128 + impl<'de> Visitor<'de> for HashCollectionVisitor { 129 + type Value = HashCollection; 130 + 131 + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 132 + formatter.write_str("a single SRI hash or a collection of them (separated by spaces)") 133 + } 134 + 135 + fn visit_str<E>(self, value: &str) -> Result<HashCollection, E> 136 + where 137 + E: de::Error, 138 + { 139 + let hashes = value 140 + .split_ascii_whitespace() 141 + .map(Hash::new) 142 + .collect::<anyhow::Result<_>>() 143 + .map_err(E::custom)?; 144 + 145 + Ok(HashCollection(hashes)) 146 + } 147 + } 148 + 149 + #[derive(Debug, Deserialize, PartialEq, Eq, Hash)] 150 + pub struct Hash(String); 151 + 152 + // Hash algorithms, in ascending preference. 153 + const ALGOS: &[&str] = &["sha1", "sha512"]; 154 + 155 + impl Hash { 156 + fn new(s: impl AsRef<str>) -> anyhow::Result<Hash> { 157 + let algo = s 158 + .as_ref() 159 + .split_once('-') 160 + .ok_or_else(|| anyhow!("expected SRI hash, got {:?}", s.as_ref()))? 161 + .0; 162 + 163 + if ALGOS.iter().any(|&a| algo == a) { 164 + Ok(Hash(s.as_ref().to_string())) 165 + } else { 166 + Err(anyhow!("unknown hash algorithm {algo:?}")) 167 + } 168 + } 169 + } 170 + 171 + impl fmt::Display for Hash { 172 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 173 + self.0.fmt(f) 174 + } 175 + } 176 + 177 + impl PartialOrd for Hash { 178 + fn partial_cmp(&self, other: &Hash) -> Option<Ordering> { 179 + let lhs = self.0.split_once('-')?.0; 180 + let rhs = other.0.split_once('-')?.0; 181 + 182 + ALGOS 183 + .iter() 184 + .position(|&s| lhs == s)? 185 + .partial_cmp(&ALGOS.iter().position(|&s| rhs == s)?) 
186 + } 187 + } 188 + 189 + impl Ord for Hash { 190 + fn cmp(&self, other: &Hash) -> Ordering { 191 + self.partial_cmp(other).unwrap() 192 + } 193 + } 194 + 85 195 #[allow(clippy::case_sensitive_file_extension_comparisons)] 86 196 fn to_new_packages( 87 197 old_packages: HashMap<String, OldPackage>, ··· 149 259 150 260 #[cfg(test)] 151 261 mod tests { 152 - use super::{get_initial_url, to_new_packages, OldPackage, Package, UrlOrString}; 153 - use std::collections::HashMap; 262 + use super::{ 263 + get_initial_url, to_new_packages, Hash, HashCollection, OldPackage, Package, UrlOrString, 264 + }; 265 + use std::{ 266 + cmp::Ordering, 267 + collections::{HashMap, HashSet}, 268 + }; 154 269 use url::Url; 155 270 156 271 #[test] ··· 187 302 }); 188 303 189 304 Ok(()) 305 + } 306 + 307 + #[test] 308 + fn hash_preference() { 309 + assert_eq!( 310 + Hash(String::from("sha1-foo")).partial_cmp(&Hash(String::from("sha512-foo"))), 311 + Some(Ordering::Less) 312 + ); 313 + 314 + assert_eq!( 315 + HashCollection({ 316 + let mut set = HashSet::new(); 317 + set.insert(Hash(String::from("sha512-foo"))); 318 + set.insert(Hash(String::from("sha1-bar"))); 319 + set 320 + }) 321 + .into_best(), 322 + Some(Hash(String::from("sha512-foo"))) 323 + ); 190 324 } 191 325 }
+9 -39
pkgs/build-support/node/fetch-npm-deps/src/parse/mod.rs
··· 87 87 88 88 #[derive(Debug)] 89 89 enum Specifics { 90 - Registry { integrity: String }, 90 + Registry { integrity: lock::Hash }, 91 91 Git { workdir: TempDir }, 92 92 } 93 93 ··· 134 134 Specifics::Git { workdir } 135 135 } 136 136 None => Specifics::Registry { 137 - integrity: get_ideal_hash( 138 - &pkg.integrity 139 - .expect("non-git dependencies should have assosciated integrity"), 140 - )? 141 - .to_string(), 137 + integrity: pkg 138 + .integrity 139 + .expect("non-git dependencies should have assosciated integrity") 140 + .into_best() 141 + .expect("non-git dependencies should have non-empty assosciated integrity"), 142 142 }, 143 143 }; 144 144 ··· 181 181 } 182 182 } 183 183 184 - pub fn integrity(&self) -> Option<String> { 184 + pub fn integrity(&self) -> Option<&lock::Hash> { 185 185 match &self.specifics { 186 - Specifics::Registry { integrity } => Some(integrity.clone()), 186 + Specifics::Registry { integrity } => Some(integrity), 187 187 Specifics::Git { .. } => None, 188 188 } 189 189 } ··· 304 304 } 305 305 } 306 306 307 - fn get_ideal_hash(integrity: &str) -> anyhow::Result<&str> { 308 - let split: Vec<_> = integrity.split_ascii_whitespace().collect(); 309 - 310 - if split.len() == 1 { 311 - Ok(split[0]) 312 - } else { 313 - for hash in ["sha512-", "sha1-"] { 314 - if let Some(h) = split.iter().find(|s| s.starts_with(hash)) { 315 - return Ok(h); 316 - } 317 - } 318 - 319 - Err(anyhow!("not sure which hash to select out of {split:?}")) 320 - } 321 - } 322 - 323 307 #[cfg(test)] 324 308 mod tests { 325 - use super::{get_hosted_git_url, get_ideal_hash}; 309 + use super::get_hosted_git_url; 326 310 use url::Url; 327 311 328 312 #[test] ··· 352 336 .is_err(), 353 337 "GitLab URLs should be marked as invalid (lol)" 354 338 ); 355 - } 356 - 357 - #[test] 358 - fn ideal_hashes() { 359 - for (input, expected) in [ 360 - ("sha512-foo sha1-bar", Some("sha512-foo")), 361 - ("sha1-bar md5-foo", Some("sha1-bar")), 362 - ("sha1-bar", Some("sha1-bar")), 363 - 
("sha512-foo", Some("sha512-foo")), 364 - ("foo-bar sha1-bar", Some("sha1-bar")), 365 - ("foo-bar baz-foo", None), 366 - ] { 367 - assert_eq!(get_ideal_hash(input).ok(), expected); 368 - } 369 339 } 370 340 }