Skip to content

Commit a096c33

Browse files
committed
coverage-dump: Extract some common code to an llvm_junk submodule
1 parent fcdbb4a commit a096c33

File tree

6 files changed

+51
-48
lines changed

6 files changed

+51
-48
lines changed

src/tools/coverage-dump/src/covfun.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ use std::sync::OnceLock;
55
use anyhow::{Context, anyhow};
66
use regex::Regex;
77

8-
use crate::parser::{Parser, unescape_llvm_string_contents};
8+
use crate::llvm_junk::unescape_llvm_string_contents;
9+
use crate::parser::Parser;
910

1011
pub(crate) fn dump_covfun_mappings(
1112
llvm_ir: &str,
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use std::sync::OnceLock;
2+
3+
use regex::bytes;
4+
5+
#[cfg(test)]
6+
mod tests;
7+
8+
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
9+
/// backslash escapes and returns a vector containing the resulting byte string.
10+
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
11+
let escape_re = {
12+
static RE: OnceLock<bytes::Regex> = OnceLock::new();
13+
// LLVM IR supports two string escapes: `\\` and `\xx`.
14+
RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap())
15+
};
16+
17+
fn u8_from_hex_digits(digits: &[u8]) -> u8 {
18+
// We know that the input contains exactly 2 hex digits, so these calls
19+
// should never fail.
20+
assert_eq!(digits.len(), 2);
21+
let digits = std::str::from_utf8(digits).unwrap();
22+
u8::from_str_radix(digits, 16).unwrap()
23+
}
24+
25+
escape_re
26+
.replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
27+
let byte = match captures.get(1) {
28+
None => b'\\',
29+
Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
30+
};
31+
[byte]
32+
})
33+
.into_owned()
34+
}
35+
36+
/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
37+
/// 64 bits as a way to associate data stored in different tables/sections.
38+
pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 {
39+
use md5::{Digest, Md5};
40+
let mut hasher = Md5::new();
41+
hasher.update(bytes);
42+
let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap();
43+
// The truncated hash is explicitly little-endian, regardless of host
44+
// or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
45+
u64::from_le_bytes(hash)
46+
}

src/tools/coverage-dump/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod covfun;
2+
mod llvm_junk;
23
mod parser;
34
mod prf_names;
45

src/tools/coverage-dump/src/parser.rs

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,4 @@
1-
#[cfg(test)]
2-
mod tests;
3-
4-
use std::sync::OnceLock;
5-
61
use anyhow::ensure;
7-
use regex::bytes;
8-
9-
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
10-
/// backslash escapes and returns a vector containing the resulting byte string.
11-
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
12-
let escape_re = {
13-
static RE: OnceLock<bytes::Regex> = OnceLock::new();
14-
// LLVM IR supports two string escapes: `\\` and `\xx`.
15-
RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap())
16-
};
17-
18-
fn u8_from_hex_digits(digits: &[u8]) -> u8 {
19-
// We know that the input contains exactly 2 hex digits, so these calls
20-
// should never fail.
21-
assert_eq!(digits.len(), 2);
22-
let digits = std::str::from_utf8(digits).unwrap();
23-
u8::from_str_radix(digits, 16).unwrap()
24-
}
25-
26-
escape_re
27-
.replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
28-
let byte = match captures.get(1) {
29-
None => b'\\',
30-
Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
31-
};
32-
[byte]
33-
})
34-
.into_owned()
35-
}
362

373
pub(crate) struct Parser<'a> {
384
rest: &'a [u8],

src/tools/coverage-dump/src/prf_names.rs

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ use std::sync::OnceLock;
44
use anyhow::{anyhow, ensure};
55
use regex::Regex;
66

7-
use crate::parser::{Parser, unescape_llvm_string_contents};
7+
use crate::llvm_junk::{truncated_md5, unescape_llvm_string_contents};
8+
use crate::parser::Parser;
89

910
/// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names`
1011
/// entries, decodes them, and creates a table that maps name hash values to
@@ -25,18 +26,6 @@ pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap
2526
Some(payload)
2627
}
2728

28-
/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
29-
/// 64 bits as a way to associate data stored in different tables/sections.
30-
fn truncated_md5(bytes: &[u8]) -> u64 {
31-
use md5::{Digest, Md5};
32-
let mut hasher = Md5::new();
33-
hasher.update(bytes);
34-
let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap();
35-
// The truncated hash is explicitly little-endian, regardless of host
36-
// or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
37-
u64::from_le_bytes(hash)
38-
}
39-
4029
fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> {
4130
// In practice, raw symbol names should always be ASCII.
4231
let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;

0 commit comments

Comments
 (0)