- ?p>
捎?CoverM 存储库更新, 记录的代码可能与最新代码稍有不同
Contig
分析 mean 方法
- contig 方法的入口位于 src/bin/coverm::main 的 “contig” 分支中, 忽略帮助等参数, 假设已有 bam 文件, 不设置 reads 的额外
filter_params , 则函数主体简写如下:fn main() { let mut app = build_cli(); let matches = app.clone().get_matches(); set_log_level(&matches, false); let mut print_stream; match matches.subcommand_name() { Some("contig") => { let m = matches.subcommand_matches("contig").unwrap(); let print_zeros = true; // Add metabat filtering params since we are running contig let filter_params = FilterParameters::generate_from_clap(m); let threads = *m.get_one::<u16>("threads").unwrap(); print_stream = OutputWriter::generate(m.get_one::<String>("output-file").map(|x| &**x)); let mut estimators_and_taker = EstimatorsAndTaker::generate_from_clap(m, print_stream.clone()); estimators_and_taker = estimators_and_taker.print_headers("Contig", print_stream.clone()); if m.contains_id("bam-files") { let bam_files: Vec<&str> = m .get_many::<String>("bam-files") .unwrap() .map(|x| &**x) .collect(); if filter_params.doing_filtering() { unreachable!() } else if m.get_flag("sharded") { unreachable!() } else { let bam_readers = coverm::bam_generator::generate_named_bam_readers_from_bam_files(bam_files); run_contig( &mut estimators_and_taker, bam_readers, print_zeros, filter_params.flag_filters, threads, &mut print_stream, ); } } else { unreachable!() } } _ => { app.print_help().unwrap(); println!(); } } }
filter_params = FilterParameters::generate_from_clap(m); 获得了对 bam 文件过滤的参数, 这些参数的获取在 “genome”, “filter”, 和 “contig” 三个分支中是相同的, 在这里先忽略let bam_readers = coverm::bam_generator::generate_named_bam_readers_from_bam_files(bam_files); 将 bam 文件名转换为结构化的对象- 随后通过
run_contig 进行读取
run_contig
- run_contig 如下:
fn run_contig< R: coverm::bam_generator::NamedBamReader, T: coverm::bam_generator::NamedBamReaderGenerator<R>, >( estimators_and_taker: &mut EstimatorsAndTaker, bam_readers: Vec<T>, print_zeros: bool, flag_filters: FlagFilter, threads: u16, print_stream: &mut OutputWriter, ) { let reads_mapped = coverm::contig::contig_coverage( bam_readers, &mut estimators_and_taker.taker, &mut estimators_and_taker.estimators, print_zeros, &flag_filters, threads, ); debug!("Finalising printing .."); estimators_and_taker.printer.finalise_printing( &estimators_and_taker.taker, print_stream, Some(&reads_mapped), &estimators_and_taker.columns_to_normalise, estimators_and_taker.rpkm_column, estimators_and_taker.tpm_column, ); }
- 其首先使用
coverm::contig::contig_coverage 计算 coverage, 随后再通过estimators_and_taker.printer.finalise_printing 输出
- 其首先使用
contig_coverage
- contig_coverage 的主体框架如下:
pub fn contig_coverage<R: NamedBamReader, G: NamedBamReaderGenerator<R>, T: CoverageTaker>( bam_readers: Vec<G>, coverage_taker: &mut T, coverage_estimators: &mut Vec<CoverageEstimator>, print_zero_coverage_contigs: bool, flag_filters: &FlagFilter, threads: u16, ) -> Vec<ReadsMapped> { let mut reads_mapped_vector = vec![]; for bam_generator in bam_readers { let mut bam_generated = bam_generator.start(); 中间步骤 let reads_mapped = ReadsMapped { num_mapped_reads: num_mapped_reads_total, num_reads: bam_generated.num_detected_primary_alignments(), }; reads_mapped_vector.push(reads_mapped); if bam_generated.num_detected_primary_alignments() == 0 { warn!( "No primary alignments were observed for sample {} - perhaps something went wrong in the mapping?", stoit_name ); } bam_generated.finish(); } reads_mapped_vector }
- 可见, coverm 逐个处理 bam 文件, 记录每个 bam 的
num_mapped_reads 和 reads 总数 (bam_generated.num_detected_primary_alignments() , 仅不考虑is_secondary 和is_supplementary 情况, 但是允许 unmapped), 随后返回这个列表
- 可见, coverm 逐个处理 bam 文件, 记录每个 bam 的
- 在这个函数的中间步骤中首先初始化参数, 随后进入一个巨大的 loop 循环
// for record in records loop { match bam_generated.read(&mut record) { None => { break; } Some(Ok(())) => {} Some(e) => { panic!("Error reading BAM record: {:?}", e) } }
- 在这个 loop 循环中, 首先读取 reads 比对 (不考虑 paired)
- 仅处理质量足够好的 reads:
if !flag_filters.passes(&record) { trace!("Skipping read based on flag filtering"); continue; } let tid = record.tid(); // if mapped if !record.is_unmapped() {
- 假设 bam 文件已经按 contig 完成排序 (也就是说, 所有比对到同一个 reference 上的 reads 都在一起), 那么需要报告所有上一条 reads 的结果:
// if reference has changed, print the last record if tid != last_tid { if tid < last_tid { error!("BAM file appears to be unsorted. Input BAM files must be sorted by reference (i.e. by samtools sort)"); panic!("BAM file appears to be unsorted. Input BAM files must be sorted by reference (i.e. by samtools sort)"); } process_previous_contigs( last_tid, tid, coverage_estimators, &ups_and_downs, num_mapped_reads_in_current_contig, total_edit_distance_in_current_contig, total_indels_in_current_contig, &mut num_mapped_reads_total, );
- 由于这种结构, 可见 bam 中最后一个 contig 的比对序列结束后循环也结束了, 因此在循环结束后还需进行一次
process_previous_contigs - 随后重置参数, 以处理 (当前 reads mapping 的) 下一条 contig 的结果
ups_and_downs = vec![0; header.target_len(tid as u32).expect("Corrupt BAM file?") as usize]; debug!( "Working on new reference {}", std::str::from_utf8(target_names[tid as usize]).unwrap() ); last_tid = tid; num_mapped_reads_in_current_contig = 0; total_edit_distance_in_current_contig = 0; total_indels_in_current_contig = 0;
- 由于这种结构, 可见 bam 中最后一个 contig 的比对序列结束后循环也结束了, 因此在循环结束后还需进行一次
- 按照 reads mapping 的 cigar 序列, 向整数向量
ups_and_downs 中添加 mapped bases 的变化率:} if !record.is_supplementary() && !record.is_secondary() { num_mapped_reads_in_current_contig += 1; } // for each chunk of the cigar string trace!( "read name {:?}", std::str::from_utf8(record.qname()).unwrap() ); let mut cursor: usize = record.pos() as usize; for cig in record.cigar().iter() { trace!("Found cigar {:} from {}", cig, cursor); match cig { Cigar::Match(_) | Cigar::Diff(_) | Cigar::Equal(_) => { // if M, X, or = increment start and decrement end index trace!( "Adding M, X, or = at {} and {}", cursor, cursor + cig.len() as usize ); ups_and_downs[cursor] += 1; let final_pos = cursor + cig.len() as usize; if final_pos < ups_and_downs.len() { // True unless the read hits the contig end. ups_and_downs[final_pos] -= 1; } cursor += cig.len() as usize; } Cigar::Del(_) => { cursor += cig.len() as usize; total_indels_in_current_contig += cig.len() as u64; } Cigar::RefSkip(_) => { // if D or N, move the cursor cursor += cig.len() as usize; } Cigar::Ins(_) => { total_indels_in_current_contig += cig.len() as u64; } Cigar::SoftClip(_) | Cigar::HardClip(_) | Cigar::Pad(_) => {} } }
- 找到所有可比对的区域 (
M, X, or = ) - 比对区域起始处
ups_and_downs[cursor] += 1 - 终止处
ups_and_downs[cursor + cig.len() as usize] -= 1
- 找到所有可比对的区域 (
- 假设 bam 文件已经按 contig 完成排序 (也就是说, 所有比对到同一个 reference 上的 reads 都在一起), 那么需要报告所有上一条 reads 的结果:
process_previous_contigs
- process_previous_contigs 是一个在 contig_coverage 中定义的函数, 接受如下参数:
let mut process_previous_contigs = |last_tid, tid, coverage_estimators: &mut Vec<CoverageEstimator>, ups_and_downs: &[i32], num_mapped_reads_in_current_contig, total_edit_distance_in_current_contig, total_indels_in_current_contig, num_mapped_reads_total: &mut u64| {
- 随后 (忽略
last_tid == -2 即初始情况), 进行覆盖度的估计:for estimator in coverage_estimators.iter_mut() { estimator.add_contig( ups_and_downs, num_mapped_reads_in_current_contig, total_edit_distance_in_current_contig - total_indels_in_current_contig, ) } let coverages: Vec<f32> = coverage_estimators .iter_mut() .map(|estimator| estimator.calculate_coverage(&[0])) .collect();
- 使用
estimator.add_contig 总结当前 contig 的 mapping 情况 - 使用
estimator.calculate_coverage 计算丰度, 此时设unobserved_contig_lengths: &[0] 即忽略较短的 contigs
- 使用
- 输出到表格对应
last_tid 的行中coverage_taker.start_entry( last_tid as usize, std::str::from_utf8(target_names[last_tid as usize]).unwrap(), ); for (coverage, estimator) in coverages.iter().zip(coverage_estimators.iter_mut()) { estimator.print_coverage(coverage, coverage_taker); } coverage_taker.finish_entry();
- 在 contig 中, 无需考虑多个 contig 的情况, 因此重置 estimator
for estimator in coverage_estimators.iter_mut() { estimator.setup(); }
- 随后 (忽略
MeanGenomeCoverageEstimator
add_contig-MeanGenomeCoverageEstimator
- 考虑 mean 方法:
fn add_contig( &mut self, ups_and_downs: &[i32], num_mapped_reads_in_contig: u64, total_mismatches_in_contig: u64, ) { match self { CoverageEstimator::MeanGenomeCoverageEstimator { ref mut total_count, ref mut total_bases, ref mut num_covered_bases, ref mut num_mapped_reads, ref mut total_mismatches, contig_end_exclusion, .. } => { *num_mapped_reads += num_mapped_reads_in_contig; *total_mismatches += total_mismatches_in_contig; let len = ups_and_downs.len(); match *contig_end_exclusion * 2 < len as u64 { true => *total_bases += len as u64 - 2 * *contig_end_exclusion, false => { debug!("Contig too short - less than twice the contig-end-exclusion"); return; //contig is all ends, too short } } let mut cumulative_sum: i32 = 0; let start_from = *contig_end_exclusion as usize; let end_at = len - *contig_end_exclusion as usize - 1; for (i, current) in ups_and_downs.iter().enumerate() { cumulative_sum += current; if i >= start_from && i <= end_at { if cumulative_sum > 0 { *num_covered_bases += 1 } *total_count += cumulative_sum as u64; } }
- 按
ups_and_downs 的信息, 更新estimator : - 最终对应于每条 contig 的
MeanGenomeCoverageEstimator 的信息如下:total_count : u64: 全部能 mapped 到这条 contig 上的 reads 的匹配区域的总碱基数 (modified bycontig_end_exclusion )total_bases : u64: 作为模板的 contig 的总长度 (modified bycontig_end_exclusion )num_covered_bases : u64: 作为模板的 contig 上, 有 reads 覆盖的总长度 (modified bycontig_end_exclusion )num_mapped_reads : u64: 匹配且是首选匹配到这条 contig 上的 reads 的数量- total_mismatches: u64
- min_fraction_covered_bases: f32
contig_end_exclusion : u64: 默认 75, 可能是考虑到 contig 末端匹配的可靠性相对较低- exclude_mismatches: bool
- 按
calculate_coverage-MeanGenomeCoverageEstimator
- 考虑 mean 方法:
fn calculate_coverage(&mut self, unobserved_contig_lengths: &[u64]) -> f32 { match self { CoverageEstimator::MeanGenomeCoverageEstimator { total_count, total_bases, num_covered_bases, num_mapped_reads: _, total_mismatches, contig_end_exclusion, min_fraction_covered_bases, exclude_mismatches, } => { let final_total_bases = *total_bases // + 0 + CoverageEstimator::calculate_unobserved_bases( unobserved_contig_lengths, *contig_end_exclusion, ); debug!( "Calculating coverage with unobserved {:?}, total bases {}, num_covered_bases {}, total_count {}, total_mismatches {}, final_total_bases {}", unobserved_contig_lengths, total_bases, num_covered_bases, total_count, total_mismatches, final_total_bases, ); if final_total_bases == 0 || (*num_covered_bases as f32 / final_total_bases as f32) < *min_fraction_covered_bases { 0.0 } else { let calculated_coverage = match exclude_mismatches { true => (*total_count - *total_mismatches) as f32, false => *total_count as f32, } / final_total_bases as f32; debug!("Found mean coverage {}", calculated_coverage); calculated_coverage }
- 在 contig 方法中,
unobserved_contig_lengths 设为 0 - 若
num_covered_bases / final_total_bases 较低, 则直接返回 0- 意味着所有 map 到这个 contig 上的 reads 全部不算数了~
- 在当前版本中, 暂不考虑
exclude_mismatches - 故
mean 方法的calculated_coverage 计算方式为:total_count / total_bases - 其中每条 contig 两侧长达
contig_end_exclusion 的片段不在计算平均值的范围内.
- 在 contig 方法中,
分析 trimmed_mean 方法
- 假设已有 bam 文件, 不设置 reads 的额外
filter_params , contig 函数主体简写见上- 计算 coverage method 的方法选择是通过
mut estimators_and_taker = EstimatorsAndTaker::generate_from_clap(m, print_stream.clone()); 设置的:let mut estimators_and_taker = EstimatorsAndTaker::generate_from_clap(m, print_stream.clone());
EstimatorsAndTaker::generate_from_clap 具有一个CoverageEstimator 列表 (estimators_and_taker.estimators )- 前述的 “mean” 对应的即为
MeanGenomeCoverageEstimator - “trimmed_mean” 对应的即为
TrimmedMeanGenomeCoverageEstimator - “covered_fraction” 对应的即为
CoverageFractionGenomeCoverageEstimator
- 前述的 “mean” 对应的即为
- 计算 coverage method 的方法选择是通过
- 一直到进行覆盖度的估计, 所有中间的所有过程都没有区别, 直到
- 使用
estimator.add_contig 总结当前 contig 的 mapping 情况 - 使用
estimator.calculate_coverage 计算丰度, 此时设unobserved_contig_lengths: &[0] 即忽略较短的 contigs
- 使用
TrimmedMeanGenomeCoverageEstimator
add_contig-TrimmedMeanGenomeCoverageEstimator
- trimmed_mean 方法:
CoverageEstimator::TrimmedMeanGenomeCoverageEstimator { ref mut counts, ref mut observed_contig_length, ref mut num_covered_bases, ref mut num_mapped_reads, contig_end_exclusion, .. } => { *num_mapped_reads = num_mapped_reads_in_contig; let len1 = ups_and_downs.len(); match *contig_end_exclusion * 2 < len1 as u64 { true => { debug!("Adding len1 {}", len1); *observed_contig_length += len1 as u64 - 2 * *contig_end_exclusion } false => { debug!("Contig too short - less than twice the contig-end-exclusion"); return; //contig is all ends, too short } } debug!("Total observed length now {}", *observed_contig_length); let mut cumulative_sum: i32 = 0; let start_from = *contig_end_exclusion as usize; let end_at = len1 - *contig_end_exclusion as usize - 1; debug!("ups and downs {:?}", ups_and_downs); for (i, current) in ups_and_downs.iter().enumerate() { if *current != 0 { debug!( "At i {}, cumulative sum {} and current {}", i, cumulative_sum, current ); } cumulative_sum += current; if i >= start_from && i <= end_at { if cumulative_sum > 0 { *num_covered_bases += 1 } if counts.len() <= cumulative_sum as usize { (*counts).resize(cumulative_sum as usize + 1, 0); } (*counts)[cumulative_sum as usize] += 1 } }
- 此时不考虑
total_mismatches_in_contig (错配) - 最终对应于每条 contig 的
TrimmedMeanGenomeCoverageEstimator 的信息如下:counts : Vec:counts[n_base] = n_site 记录了 reference contig 上, 有多少 (n_site) 个碱基能比对上给定数量 (n_base) 条的 readsobserved_contig_length : u64: 同MeanGenomeCoverageEstimator.total_bases num_covered_bases : u64: (同上)num_mapped_reads : u64: 当前 contig 的num_mapped_reads *num_mapped_reads = num_mapped_reads_in_contig
- min: f32:
- max: f32:
- min_fraction_covered_bases: f32: (同上)
contig_end_exclusion : u64: (同上)
- 此时不考虑
calculate_coverage-TrimmedMeanGenomeCoverageEstimator
- trimmed_mean 方法 类似于 mean 方法, 但首先考虑了 coverage 为 0 的情况:
CoverageEstimator::TrimmedMeanGenomeCoverageEstimator { counts, observed_contig_length, num_covered_bases, num_mapped_reads: _, contig_end_exclusion, min_fraction_covered_bases, min, max, } => { let unobserved_contig_length = CoverageEstimator::calculate_unobserved_bases( unobserved_contig_lengths, *contig_end_exclusion, ); let total_bases = *observed_contig_length + unobserved_contig_length; debug!("Calculating coverage with num_covered_bases {}, observed_length {}, unobserved_length {:?} and counts {:?}", num_covered_bases, observed_contig_length, unobserved_contig_lengths, counts); match total_bases { 0 => 0.0, _ => { if (*num_covered_bases as f32 / total_bases as f32) < *min_fraction_covered_bases { 0.0 } else { let min_index: usize = (*min * total_bases as f32).floor() as usize; let max_index: usize = (*max * total_bases as f32).ceil() as usize; if *num_covered_bases == 0 { return 0.0; } counts[0] += unobserved_contig_length;
- 同上:
unobserved_contig_lengths 和counts[0] 在 contig 情况下为 0- 忽略所有
num_covered_bases / final_total_bases 较低的 contigs 的 reads 和丰度
- 根据
TrimmedMeanGenomeCoverageEstimator.min 和TrimmedMeanGenomeCoverageEstimator.max , 计算了对应 contig 总长的min_index 和max_index (尽量取大范围) - 其他 (即 coverage 不等于 0) 的情况:
let mut num_accounted_for: usize = 0; let mut total: usize = 0; let mut started = false; for (i, num_covered) in counts.iter().enumerate() { num_accounted_for += *num_covered as usize; debug!( "start: i {}, num_accounted_for {}, total {}, min {}, max {}", i, num_accounted_for, total, min_index, max_index ); if num_accounted_for >= min_index { debug!("inside"); if started { if num_accounted_for > max_index { debug!( "num_accounted_for {}, *num_covered {}", num_accounted_for, *num_covered ); let num_excess = num_accounted_for - *num_covered as usize; let num_wanted = match max_index >= num_excess { true => max_index - num_excess + 1, false => 0, }; debug!("num wanted1: {}", num_wanted); total += num_wanted * i; break; } else { total += *num_covered as usize * i; } } else if num_accounted_for > max_index { // all coverages are the same in the trimmed set total = (max_index - min_index + 1) * i; started = true } else if num_accounted_for < min_index { debug!("too few on first") } else { let num_wanted = num_accounted_for - min_index + 1; debug!("num wanted2: {}", num_wanted); total = num_wanted * i; started = true; } } debug!( "end i {}, num_accounted_for {}, total {}", i, num_accounted_for, total ); } total as f32 / (max_index - min_index) as f32 } } }
- 按 reads 覆盖数 (从小到大) 排序后, 逐步处理其中的序列
- 一开始
started = false , 并逐步累加num_accounted_for (当前处理的总 contig base 数量) - 当超过
min_index 后, 开始累加总匹配 reads 的 base 数到total 中 - 直到超过
min_index , 直接中断返回
- 一开始
- 按 reads 覆盖数 (从小到大) 排序后, 逐步处理其中的序列
- 故
trimmed_mean 方法的calculated_coverage 计算方式为:total_count / total_bases - 其中每条 contig 两侧长达
contig_end_exclusion 的片段不在计算平均值的范围内 - 此外, 覆盖度最高的
max% 和最低的min% 部分的 base 不在计算平均值的范围内
- 同上: