Initial commit of tools under development for data QC through firehose.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2834 348d0f76-0448-11de-a6fe-93d51630548a
chartl 2010-02-12 19:13:24 +00:00
parent 77af5822d4
commit 04a2784bf7
6 changed files with 564 additions and 42 deletions

org/broadinstitute/sting/oneoffprojects/filters/ContaminatedSampleFilter.java View File

@@ -0,0 +1,24 @@
package org.broadinstitute.sting.oneoffprojects.filters;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import java.util.Arrays;
import java.util.HashSet;
/**
 * A read filter that removes reads whose read group belongs to a sample known to be
 * contaminated, so those samples can be excluded from QC calculations.
 *
 * @author chartl
 * @date Feb 8, 2010
 */
public class ContaminatedSampleFilter implements SamRecordFilter {
private final String[] filteredNames = {"NA19562","NA19006","NA19554","NA18985","NA18988","NA19062","NA19559"};
private HashSet<String> contaminatedSamples = new HashSet<String>( Arrays.asList(filteredNames) );
public boolean filterOut(SAMRecord read) {
// drop any read whose read group maps to a contaminated sample; reads with no read group are kept
SAMReadGroupRecord rg = read.getReadGroup();
return rg != null && contaminatedSamples.contains(rg.getSample());
}
}

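As a usage sketch, a filter like this can wrap a BAM iterator. The example below is illustrative only: it assumes the Picard/samtools API of this era (SAMFileReader and net.sf.picard.filter.FilteringIterator), and the input BAM path is hypothetical.

package org.broadinstitute.sting.oneoffprojects.filters;

import net.sf.picard.filter.FilteringIterator;
import net.sf.samtools.SAMFileReader;
import java.io.File;

public class ContaminationFilterDemo {
    public static void main(String[] args) {
        SAMFileReader reader = new SAMFileReader(new File("sample.bam")); // hypothetical input BAM
        // FilteringIterator discards every record for which the filter's filterOut() returns true
        FilteringIterator cleanReads = new FilteringIterator(reader.iterator(), new ContaminatedSampleFilter());
        int kept = 0;
        while ( cleanReads.hasNext() ) {
            cleanReads.next();
            kept++; // only reads from samples not on the contaminated list reach this point
        }
        System.out.println("reads passing the contamination filter: " + kept);
        reader.close();
    }
}
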
org/broadinstitute/sting/oneoffprojects/firehosesummary/DepthStatisticsCalculator.java View File

@@ -0,0 +1,128 @@
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Accumulates depth-of-coverage summary statistics -- per-locus and per-target coverage
 * counts on top of the streaming moments from SummaryStatisticsCalculator.
 *
 * @author chartl
 * @date Feb 12, 2010
 */
public class DepthStatisticsCalculator extends SummaryStatisticsCalculator {
private int numBasesAbove0x; // number of loci with >= 1x coverage
private int numBasesAbove3x; // number of loci with >= 4x coverage
private int numBasesAbove9x; // number of loci with >= 10x coverage
private int numBasesAbove24x; // number of loci with >= 25x coverage
private int numBasesAbove49x; // number of loci with >= 50x coverage
private int numBasesAbove99x; // number of loci with >= 100x coverage
private int targetsAbove0x;
private int targetsAbove3x;
private int targetsAbove9x;
private int targetsAbove24x;
private int targetsAbove49x;
private int targetsAbove99x;
private int numTargets;
public static double[] DEPTH_CUTOFFS = {1,4,10,25,50,100};
public DepthStatisticsCalculator(String name) {
super(name);
numBasesAbove0x = 0;
numBasesAbove3x = 0;
numBasesAbove9x = 0;
numBasesAbove24x = 0;
numBasesAbove49x = 0;
numBasesAbove99x = 0;
targetsAbove99x = 0;
targetsAbove49x = 0;
targetsAbove24x = 0;
targetsAbove9x = 0;
targetsAbove3x = 0;
targetsAbove0x = 0;
numTargets = 0;
}
public void updateLocus(int depth) {
super.update(depth);
if ( depth > 99 ) {
numBasesAbove99x++;
}
if ( depth > 49 ) {
numBasesAbove49x++;
}
if ( depth > 24 ) {
numBasesAbove24x++;
}
if ( depth > 9 ) {
numBasesAbove9x++;
}
if ( depth > 3 ) {
numBasesAbove3x++;
}
if ( depth > 0 ) {
numBasesAbove0x++;
}
}
public void updateTargets(int targetLength, int totalCoverage) {
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength);
if ( avgCvg >= 100 ) {
targetsAbove99x ++;
}
if ( avgCvg >= 50 ) {
targetsAbove49x++;
}
if ( avgCvg >= 25 ) {
targetsAbove24x++;
}
if ( avgCvg >= 10 ) {
targetsAbove9x++;
}
if ( avgCvg >= 4 ) {
targetsAbove3x++;
}
if ( avgCvg >= 1 ) {
targetsAbove0x++;
}
numTargets++;
}
public double[] getLocusProportions() {
double[] proportions = new double[6];
proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() );
proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() );
proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() );
proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() );
proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() );
proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() );
return proportions;
}
public double[] getTargetProportions() {
double[] proportions = new double[6];
proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets );
proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets );
proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets );
proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets );
proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets );
proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets );
return proportions;
}
public double getPercentWellCoveredLoci() {
// a percentage is 100 times the fraction; callers compare this against thresholds like 80
return 100*( (double) numBasesAbove9x )/( (double) getSampleSize() );
}
public double getPercentWellCoveredTargets() {
return 100*( (double) targetsAbove9x )/( (double) numTargets );
}
}

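A minimal usage sketch (the sample name and depth values below are invented for illustration):

DepthStatisticsCalculator calc = new DepthStatisticsCalculator("NA12878");
int[] locusDepths = {0, 2, 5, 12, 30, 120};
for ( int depth : locusDepths ) {
    calc.updateLocus(depth);
}
calc.updateTargets(100, 1500); // a 100bp target with 1500 total coverage, i.e. 15x average
double[] locusProportions = calc.getLocusProportions(); // fractions of loci at >=1x, 4x, 10x, 25x, 50x, 100x
System.out.printf("%% of loci at >=10x: %.1f%n", calc.getPercentWellCoveredLoci()); // 50.0 for the depths above
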
org/broadinstitute/sting/oneoffprojects/firehosesummary/GenerateFirehoseSummary.java View File

@@ -0,0 +1,257 @@
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.*;
import java.util.*;
/**
 * Command-line program that collates Firehose data-QC outputs (depth of coverage, contamination,
 * error rates, and zipped Picard metrics) into summary tables and, eventually, plots.
 *
 * @author chartl
 * @date Feb 11, 2010
 */
class FirehoseSummaryCLP extends CommandLineProgram {
@Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true)
private File depthOfCoverage = null;
@Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true)
private File contamination = null;
@Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true)
private File errorRate = null;
@Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false)
private String zipFiles = null;
private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R";
private static String SCRIPT_DOC_FLAG = "DOC";
protected int execute() {
SummaryFileCollection metricsFiles = getFileHandles(); // not consumed yet; metrics summarization is still under development
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(depthOfCoverage);
String docSummary = makeDOCPlots(depthStats);
return 0; // 0 signals success; nonzero is reserved for failures
}
private String makeDOCPlots(List<DepthStatisticsCalculator> calcs) {
StringBuilder summaryBuilder = new StringBuilder();
int numSamplesWithGoodBaseCoverage=0;
int numSamplesWithGoodMeanCoverage = 0;
int numSamplesWithGoodTargetCoverage=0;
double aggregateMeanCoverage = -1d;
double aggregateStdDevCoverage = -1d;
double aggregateSkewCoverage = -1d;
PrintWriter locusOutput;
PrintWriter targetOutput;
File locusOutputFile;
File targetOutputFile;
try {
locusOutputFile = File.createTempFile("locus_output",".txt"); // include the dot -- createTempFile appends the suffix verbatim
locusOutput = new PrintWriter(locusOutputFile);
targetOutputFile = File.createTempFile("target_output",".txt");
targetOutput = new PrintWriter(targetOutputFile);
} catch ( IOException e ) {
throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e);
}
for ( DepthStatisticsCalculator calc : calcs ) {
if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
// update summary statistics
if ( calc.getName().equalsIgnoreCase("total_coverage") ) {
aggregateMeanCoverage = calc.getMean();
aggregateStdDevCoverage = Math.sqrt(calc.getVar());
aggregateSkewCoverage = calc.getSkew();
}
} else {
// update sample-based summary statistics and print output to file
// (the proportion arrays must be unpacked: handing an array to printf fills only a single format slot)
double[] locusProportions = calc.getLocusProportions();
double[] targetProportions = calc.getTargetProportions();
locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),locusProportions[0],locusProportions[1],locusProportions[2],locusProportions[3],locusProportions[4],locusProportions[5]);
targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),targetProportions[0],targetProportions[1],targetProportions[2],targetProportions[3],targetProportions[4],targetProportions[5]);
if ( calc.getPercentWellCoveredLoci() > 80 ) {
numSamplesWithGoodBaseCoverage++;
}
if ( calc.getPercentWellCoveredTargets() > 90 ) {
numSamplesWithGoodTargetCoverage++;
}
if ( calc.getMean() > 10 ) {
numSamplesWithGoodMeanCoverage++;
}
}
}
locusOutput.close(); // flush the tables before (eventually) handing them to the R script
targetOutput.close();
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath());
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath());
return "temporary"; // placeholder until the textual summary is assembled from summaryBuilder
}
private SummaryFileCollection getFileHandles() {
if ( zipFiles == null ) {
return null;
}
SummaryFileCollection summaryFiles = new SummaryFileCollection();
for ( String zipFile : zipFiles.split(",") ) {
summaryFiles.process(zipFile);
}
return summaryFiles;
}
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
BufferedReader docReader;
try {
docReader = new BufferedReader( new FileReader(docFile) );
} catch ( IOException e) {
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
}
String locusHeader = getDOCSectionHeader(docReader);
List<DepthStatisticsCalculator> docCalculators = instantiateDOCCalculators(locusHeader);
updateLocusInfo(docCalculators,docReader);
String targetHeader = getDOCSectionHeader(docReader); // advances the reader past the per-target section header; its value is not needed
updateTargetInfo(docCalculators,docReader);
return docCalculators;
}
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
String docLocLine;
try {
docLocLine = reader.readLine();
while ( docLocLine != null && ! isEndOfSection(docLocLine) ) {
int offset = -1;
for ( String entry : docLocLine.split("\t") ) {
if ( offset > -1 ) {
calcs.get(offset).updateLocus(Integer.parseInt(entry));
}
offset++;
}
docLocLine = reader.readLine(); // advance to the next locus, or the loop never terminates
}
} catch ( IOException e) {
throw new StingException("Error reading locus depth of coverage information",e);
}
}
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
String docLocLine;
try {
docLocLine = reader.readLine();
while ( docLocLine != null && ! isEndOfSection(docLocLine) ) {
int offset = -1;
int targetSize = 0;
for ( String entry : docLocLine.split("\t") ) {
if ( offset == -1 ) {
targetSize = parseInterval(entry);
} else {
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
}
offset++;
}
docLocLine = reader.readLine(); // advance to the next target, or the loop never terminates
}
} catch ( IOException e ) {
throw new StingException("Error reading target depth of coverage information",e);
}
}
private int parseInterval(String interval) {
// intervals arrive as chr:start-stop; GATK coordinates are 1-based and inclusive,
// so the target length is stop - start + 1
String startstop = interval.split(":")[1];
int start = Integer.parseInt(startstop.split("-")[0]);
int stop = Integer.parseInt(startstop.split("-")[1]);
return stop - start + 1;
}
private boolean isDOCSectionSeparator( String line ) {
return line.contains("_COVERAGE_SECTION");
}
private boolean isEndOfSection( String line ) {
// sections delimited by empty line
return line.equalsIgnoreCase("");
}
private String getDOCSectionHeader(BufferedReader reader) {
String header;
try {
do {
header = reader.readLine();
} while ( header != null && ! isDOCSectionSeparator(header) );
if ( header == null ) {
throw new StingException("Unexpected end of depth of coverage file while searching for a section separator");
}
header = reader.readLine(); // the line following the separator holds the column names
} catch (IOException e) {
throw new StingException("Error reading depth of coverage file",e);
}
return header;
}
private List<DepthStatisticsCalculator> instantiateDOCCalculators(String header) {
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
int offset = -1;
for ( String entry : header.split("\t") ) {
if ( offset > -1 ) {
calcs.add(new DepthStatisticsCalculator(entry));
}
offset++;
}
return calcs;
}
}
public class GenerateFirehoseSummary {
public static void main(String[] args) {
FirehoseSummaryCLP clp = new FirehoseSummaryCLP();
CommandLineProgram.start(clp, args);
System.exit(0);
}
}
class SummaryFileCollection {
// container class for files we'll be summarizing
public Map<String,File> fingerprintSummaryFiles;
public Map<String,File> hybridSelectionMetricsFiles;
public Map<String,File> insertSizeDistributionFiles;
public Map<String,File> alignmentMetricsFiles;
public SummaryFileCollection() {
fingerprintSummaryFiles = new HashMap<String,File>();
hybridSelectionMetricsFiles = new HashMap<String, File>();
insertSizeDistributionFiles = new HashMap<String,File>();
alignmentMetricsFiles = new HashMap<String,File>();
}
public void process(String zipFilePath) {
// expects archive names of the form <prefix>_<sample>_sequencing_metrics.zip
String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1];
File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics");
File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics");
File insertSizeFile = new File(sampleName+".insert_size_metrics");
File alignmentFile = new File(sampleName+".alignment_metrics");
String command = "unzip "+zipFilePath;
try {
Process p = Runtime.getRuntime().exec(command);
p.waitFor(); // block until extraction finishes; the metrics files are consumed right away
} catch (IOException e) {
throw new RuntimeException("Could not unzip the file "+zipFilePath,e);
} catch (InterruptedException e) {
throw new RuntimeException("Interrupted while unzipping the file "+zipFilePath,e);
}
fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile);
hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile);
insertSizeDistributionFiles.put(sampleName,insertSizeFile);
alignmentMetricsFiles.put(sampleName,alignmentFile);
}
}

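For reference, a hypothetical invocation (the jar name and file paths are assumptions; the short argument names come from the @Argument annotations above):

java -cp dist/Sting.jar org.broadinstitute.sting.oneoffprojects.firehosesummary.GenerateFirehoseSummary \
    -doc project.depth_of_coverage \
    -con project.contamination \
    -err project.error_rates \
    -zip proj_NA12878_sequencing_metrics.zip,proj_NA12891_sequencing_metrics.zip

Note that the zip names carry a token ahead of the sample name, since process() takes the second underscore-delimited field as the sample.
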
org/broadinstitute/sting/oneoffprojects/firehosesummary/SummaryStatisticsCalculator.java View File

@@ -0,0 +1,69 @@
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
/**
 * This object calculates the first three sample moments of a data set on the fly.
 *
 * @author chartl
 * @date Feb 12, 2010
 */
public class SummaryStatisticsCalculator {
// todo -- can median / quantiles be estimated on-the-fly?
// todo -- can we divine a metric for modality?
private double mean; // mean of the samples fed to it
private double var; // variance of the samples fed to it
private double skew; // skew of the samples fed to it
private int sampleSize; // number of samples given to calculator
private String name; // name to associate calculator with, if any
public SummaryStatisticsCalculator() {
mean = 0;
var = 0;
skew = 0;
sampleSize = 0;
name = "SUMMARY";
}
public SummaryStatisticsCalculator(String name) {
mean = 0;
var = 0;
skew = 0;
sampleSize = 0;
this.name = name;
}
public void update(double datum) {
mean = (sampleSize*mean + datum)/(sampleSize + 1);
var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1);
// note: the variance is not re-calculated over all data points each time the mean changes;
// convergence is therefore slower than n^(-1/2), but it still converges along with the mean.
// a whole exome is ~3.3 million data points, which is plenty for this to converge to within a very
// small proportion of the true sample variance
if ( var > 0 ) {
// standardized skew: the third central moment normalized by var^(3/2), i.e. by sd^3;
// beware integer division here -- Math.pow(var,2/3) would normalize by var^0 = 1.
// the guard avoids 0/0 before any spread has been observed
skew = ( skew*Math.pow(var,1.5)*sampleSize + Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )*Math.pow(var,1.5) );
}
// again, new mean/variance estimates are not propagated over all previous data points, so convergence
// is a bit slower -- the price one pays for on-the-fly calculation
sampleSize++;
}
public void update(int datum) {
update( (double) datum);
}
public double getMean() {
return mean;
}
public double getVar() {
return var;
}
public double getSkew() {
return skew;
}
public String getName() {
return name;
}
public int getSampleSize() {
return sampleSize;
}
}

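A quick sanity check on invented data (the streaming mean is exact; the variance and skew are the approximations described in the comments above, so they land near, not exactly on, the two-pass values):

SummaryStatisticsCalculator stats = new SummaryStatisticsCalculator("demo");
double[] data = {2, 4, 4, 4, 5, 5, 7, 9}; // two-pass mean = 5.0, population variance = 4.0
for ( double datum : data ) {
    stats.update(datum);
}
System.out.printf("mean=%.3f var=%.3f skew=%.3f n=%d%n",
        stats.getMean(), stats.getVar(), stats.getSkew(), stats.getSampleSize());
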
View File

@@ -20,33 +20,33 @@ class VCFConcordanceCalculator {
private int minimumDepthForUpdate;
private int minimumGenotypeQuality;
private String name;
-private Set<GenomeLoc> falsePositiveLoci;
-private Set<GenomeLoc> falseNegativeLoci;
-private Set<GenomeLoc> falseNegativeLociDueToNoCall;
-private Set<GenomeLoc> falseNegativeLociDueToFilters;
-private Set<GenomeLoc> hetsCalledHoms;
-private Set<GenomeLoc> homsCalledHets;
-private Set<GenomeLoc> nonConfidentGenotypeCalls;
-private Set<GenomeLoc> concordantHomCalls;
-private Set<GenomeLoc> concordantHetCalls;
-private Set<GenomeLoc> concordantGenotypeReferenceCalls;
-private Set<GenomeLoc> chipNoCalls;
-private Set<GenomeLoc> ignoredDueToDepth;
+private int falsePositiveLoci;
+private int falseNegativeLoci;
+private int falseNegativeLociDueToNoCall;
+private int falseNegativeLociDueToFilters;
+private int hetsCalledHoms;
+private int homsCalledHets;
+private int nonConfidentGenotypeCalls;
+private int concordantHomCalls;
+private int concordantHetCalls;
+private int concordantGenotypeReferenceCalls;
+private int chipNoCalls;
+private int ignoredDueToDepth;
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
name = sampleName;
-falseNegativeLoci = new HashSet<GenomeLoc>();
-falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
-falsePositiveLoci = new HashSet<GenomeLoc>();
-falseNegativeLociDueToFilters = new HashSet<GenomeLoc>();
-hetsCalledHoms = new HashSet<GenomeLoc>();
-homsCalledHets = new HashSet<GenomeLoc>();
-nonConfidentGenotypeCalls = new HashSet<GenomeLoc>();
-concordantHomCalls = new HashSet<GenomeLoc>();
-concordantHetCalls = new HashSet<GenomeLoc>();
-concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
-chipNoCalls = new HashSet<GenomeLoc>();
-ignoredDueToDepth = new HashSet<GenomeLoc>();
+falseNegativeLoci = 0;
+falseNegativeLociDueToNoCall = 0;
+falsePositiveLoci = 0;
+falseNegativeLociDueToFilters = 0;
+hetsCalledHoms = 0;
+homsCalledHets = 0;
+nonConfidentGenotypeCalls = 0;
+concordantHomCalls = 0;
+concordantHetCalls = 0;
+concordantGenotypeReferenceCalls = 0;
+chipNoCalls = 0;
+ignoredDueToDepth = 0;
minimumDepthForUpdate = minimumDepth;
minimumGenotypeQuality = minGenQual;
}
@@ -57,37 +57,37 @@ class VCFConcordanceCalculator {
public void updateTruthOnly(LocusConcordanceInfo info) {
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
-falseNegativeLoci.add(info.getLoc());
+falseNegativeLoci++;
} else {
-concordantGenotypeReferenceCalls.add(info.getLoc());
+concordantGenotypeReferenceCalls++;
}
}
public void updateFilteredLocus(LocusConcordanceInfo info) {
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase()) ) {
-falseNegativeLociDueToFilters.add(info.getLoc());
+falseNegativeLociDueToFilters++;
}
}
public String toString() {
-return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),
-concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(),
-homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),
-falseNegativeLociDueToNoCall.size(),falseNegativeLociDueToFilters.size());
+return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth,
+concordantGenotypeReferenceCalls,concordantHomCalls,concordantHetCalls,nonConfidentGenotypeCalls,
+homsCalledHets,hetsCalledHoms,falsePositiveLoci,falseNegativeLoci,
+falseNegativeLociDueToNoCall,falseNegativeLociDueToFilters);
}
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
if ( minimumDepthForUpdate > 0 && call.getReadCount() < minimumDepthForUpdate ) {
-ignoredDueToDepth.add(loc);
+ignoredDueToDepth++;
} else if ( truth.isNoCall() ) {
-chipNoCalls.add(loc);
+chipNoCalls++;
} else if ( truth.isVariant(( char) ref) ) {
if ( call.isNoCall() ) {
-falseNegativeLociDueToNoCall.add(loc);
+falseNegativeLociDueToNoCall++;
} else if ( ! call.isVariant( (char) ref ) ) {
-falseNegativeLoci.add(loc);
+falseNegativeLoci++;
} else if ( call.isVariant((char) ref) ) {
// check het vs hom
checkGenotypeCall(truth,call, loc);
@@ -96,9 +96,9 @@ class VCFConcordanceCalculator {
} else if ( ! truth.isVariant( (char) ref ) ) {
if ( call.isVariant((char) ref) ) {
-falsePositiveLoci.add(loc);
+falsePositiveLoci++;
} else {
-concordantGenotypeReferenceCalls.add(loc);
+concordantGenotypeReferenceCalls++;
}
}
}
@@ -107,18 +107,18 @@ class VCFConcordanceCalculator {
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
if ( truth.isHet() && call.isHom() ) {
-hetsCalledHoms.add(loc);
+hetsCalledHoms++;
} else if ( truth.isHom() && call.isHet() ) {
-homsCalledHets.add(loc);
+homsCalledHets++;
} else if ( ( truth.isHet() && call.isHet() ) ) {
-concordantHetCalls.add(loc);
+concordantHetCalls++;
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
-concordantHomCalls.add(loc);
+concordantHomCalls++;
}
} else {
-nonConfidentGenotypeCalls.add(loc);
+nonConfidentGenotypeCalls++;
}
}

View File

@@ -14,6 +14,10 @@ import net.sf.samtools.SAMReadGroupRecord;
import cern.jet.stat.Probability;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
/**
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
* to be contaminants (meaning a read group that's not actually associated with the sample) by searching
@@ -28,8 +32,14 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
private Double LIMIT = 1e-9;
+@Argument(fullName="scaleForSample", shortName="scale", doc="the scale by which the pValue limit should shrink when testing samples directly. "+
+"E.g. if a sample has three 1e-3 read groups, their product is 1e-9 -- apparently significant; so the limit should shrink by some multiplicative factor "+
+"for each read group associated with the sample beyond the first. Defaults to 1e-4 [1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc]", required=false)
+private Double SCALE = 1e-4;
private UnifiedGenotyperEngine ug;
private NamedTable altTable;
+private final double EPSILON = 1e-20;
public void initialize() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
@@ -140,6 +150,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
//out.println("readgroup\tpvalue\tstatus\tbalances");
out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances");
+HashMap<String,Double> pvalByReadGroup = new HashMap<String,Double>(); // saved for the per-sample contamination section below
for (String rg : altTable.getRowNames()) {
String balances = "";
@@ -179,6 +190,8 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
// Compute pValue
double pValue = Probability.studentT(dof, t);
+pValue = pValue < EPSILON ? EPSILON : pValue; // clamp tiny pValues so per-sample products stay nonzero
+pvalByReadGroup.put(rg,pValue);
//out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances);
out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n",
@@ -186,6 +199,37 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
String.format("%e", pValue),
(pValue < LIMIT ? "aberrant" : "nominal"),
balances);
+logger.debug(rg);
}
+out.printf("%n%n%s%n","SECTION ON BADLY CONTAMINATED SAMPLES");
+out.printf("%s\t%s\t%s\t%s%n","sample","p-value","status","info");
+HashMap<String,List<String>> samplesToReadGroups = new HashMap<String,List<String>>();
+for ( SAMReadGroupRecord rec : getToolkit().getSAMFileHeader().getReadGroups() ) {
+if ( samplesToReadGroups.containsKey(rec.getSample()) ) {
+samplesToReadGroups.get(rec.getSample()).add(rec.getReadGroupId());
+} else {
+ArrayList<String> newList = new ArrayList<String>();
+newList.add(rec.getReadGroupId());
+samplesToReadGroups.put(rec.getSample(),newList);
+}
+}
+for ( String sample : samplesToReadGroups.keySet() ) {
+double p_value = 1;
+boolean containsAberrantReads = false;
+for ( String rg : samplesToReadGroups.get(sample) ) {
+double rg_pval = ( pvalByReadGroup.get(rg) == null ? 1 : pvalByReadGroup.get(rg) );
+p_value = p_value*rg_pval;
+containsAberrantReads = containsAberrantReads || rg_pval < LIMIT;
+logger.debug(rg);
+}
+// shrink the limit once per read group beyond the first, per the scaleForSample documentation
+// (1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc)
+double limit = LIMIT*Math.pow(SCALE, samplesToReadGroups.get(sample).size() - 1);
+out.printf("%s\t%-13s\t%s\t%s%n", sample, String.format("%e",p_value), ( p_value < limit ? "aberrant" : "nominal"), ( containsAberrantReads ? "contains_aberrant_RG" : "no_aberrant_RG"));
+}
}
}
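
To make the sample-level test concrete, here is a standalone sketch of the combination rule used above, with invented per-read-group pValues; it reproduces the scenario in the scaleForSample documentation, where three 1e-3 read groups multiply to 1e-9 but the scaled limit keeps the sample from being flagged:

double LIMIT = 1e-9, SCALE = 1e-4;
double[] rgPvals = {1e-3, 1e-3, 1e-3}; // hypothetical pValues for one sample's three read groups
double pValue = 1;
for ( double p : rgPvals ) {
    pValue *= p; // read-group evidence combines multiplicatively
}
double limit = LIMIT * Math.pow(SCALE, rgPvals.length - 1); // 1e-9 for 1 RG, 1e-13 for 2, 1e-17 for 3
System.out.println(pValue < limit ? "aberrant" : "nominal"); // 1e-9 vs 1e-17 -> nominal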