Initial commit of tools under development for data QC through firehose.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2834 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
77af5822d4
commit
04a2784bf7
|
|
@ -0,0 +1,24 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 8, 2010
|
||||
*/
|
||||
public class ContaminatedSampleFilter implements SamRecordFilter {
|
||||
|
||||
private final String[] filteredNames = {"NA19562","NA19006","NA19554","NA18985","NA18988","NA19062","NA19559"};
|
||||
|
||||
private HashSet<String> contaminatedSamples = new HashSet<String>( Arrays.asList(filteredNames) );
|
||||
|
||||
public boolean filterOut(SAMRecord read) {
|
||||
return contaminatedSamples.contains(read.getReadGroup().getSample());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 12, 2010
|
||||
*/
|
||||
public class DepthStatisticsCalculator extends SummaryStatisticsCalculator {
|
||||
private int numBasesAbove0x; // % of bases with ANY coverage
|
||||
private int numBasesAbove3x; // % of bases with 4x+
|
||||
private int numBasesAbove9x; // % of bases with 10x+
|
||||
private int numBasesAbove24x; // % of bases with 25x+
|
||||
private int numBasesAbove49x; // % of bases with 50x+
|
||||
private int numBasesAbove99x; // % of bases with 100x+
|
||||
|
||||
private int targetsAbove0x;
|
||||
private int targetsAbove3x;
|
||||
private int targetsAbove9x;
|
||||
private int targetsAbove24x;
|
||||
private int targetsAbove49x;
|
||||
private int targetsAbove99x;
|
||||
private int numTargets;
|
||||
|
||||
public static double[] DEPTH_CUTOFFS = {1,4,10,25,50,100};
|
||||
|
||||
public DepthStatisticsCalculator(String name) {
|
||||
super(name);
|
||||
numBasesAbove0x = 0;
|
||||
numBasesAbove3x = 0;
|
||||
numBasesAbove9x = 0;
|
||||
numBasesAbove24x = 0;
|
||||
numBasesAbove49x = 0;
|
||||
numBasesAbove99x = 0;
|
||||
targetsAbove99x = 0;
|
||||
targetsAbove49x = 0;
|
||||
targetsAbove24x = 0;
|
||||
targetsAbove9x = 0;
|
||||
targetsAbove3x = 0;
|
||||
targetsAbove0x = 0;
|
||||
numTargets = 0;
|
||||
}
|
||||
|
||||
public void updateLocus(int depth) {
|
||||
super.update(depth);
|
||||
|
||||
if ( depth > 99 ) {
|
||||
numBasesAbove99x++;
|
||||
}
|
||||
if ( depth > 49 ) {
|
||||
numBasesAbove49x++;
|
||||
}
|
||||
if ( depth > 24 ) {
|
||||
numBasesAbove24x++;
|
||||
}
|
||||
if ( depth > 9 ) {
|
||||
numBasesAbove9x++;
|
||||
}
|
||||
if ( depth > 3 ) {
|
||||
numBasesAbove3x++;
|
||||
}
|
||||
if ( depth > 0 ) {
|
||||
numBasesAbove0x++;
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTargets(int targetLength, int totalCoverage) {
|
||||
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength);
|
||||
|
||||
if ( avgCvg >= 100 ) {
|
||||
targetsAbove99x ++;
|
||||
}
|
||||
if ( avgCvg >= 50 ) {
|
||||
targetsAbove49x++;
|
||||
}
|
||||
if ( avgCvg >= 25 ) {
|
||||
targetsAbove24x++;
|
||||
}
|
||||
if ( avgCvg >= 10 ) {
|
||||
targetsAbove9x++;
|
||||
}
|
||||
if ( avgCvg >= 4 ) {
|
||||
targetsAbove3x++;
|
||||
}
|
||||
if ( avgCvg >= 1 ) {
|
||||
targetsAbove0x++;
|
||||
}
|
||||
|
||||
numTargets++;
|
||||
}
|
||||
|
||||
public double[] getLocusProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() );
|
||||
proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() );
|
||||
proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() );
|
||||
proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() );
|
||||
proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double[] getTargetProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets );
|
||||
proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets );
|
||||
proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets );
|
||||
proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets );
|
||||
proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets );
|
||||
proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredLoci() {
|
||||
return 10*( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredTargets() {
|
||||
return 10*( (double) targetsAbove9x )/( (double) numTargets );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 11, 2010
|
||||
*/
|
||||
|
||||
class FirehoseSummaryCLP extends CommandLineProgram {
|
||||
@Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true)
|
||||
private File depthOfCoverage = null;
|
||||
@Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true)
|
||||
private File contamination = null;
|
||||
@Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true)
|
||||
private File errorRate = null;
|
||||
@Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false)
|
||||
private String zipFiles = null;
|
||||
|
||||
private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R";
|
||||
private static String SCRIPT_DOC_FLAG = "DOC";
|
||||
|
||||
protected int execute() {
|
||||
SummaryFileCollection metricsFiles = getFileHandles();
|
||||
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(depthOfCoverage);
|
||||
String docSummary = makeDOCPlots(depthStats);
|
||||
return 1;
|
||||
}
|
||||
|
||||
private String makeDOCPlots(List<DepthStatisticsCalculator> calcs) {
|
||||
StringBuilder summaryBuilder = new StringBuilder();
|
||||
int numSamplesWithGoodBaseCoverage=0;
|
||||
int numSamplesWithGoodMeanCoverage = 0;
|
||||
int numSamplesWithGoodTargetCoverage=0;
|
||||
double aggregateMeanCoverage = -1d;
|
||||
double aggregateStdDevCoverage = -1d;
|
||||
double aggregateSkewCoverage = -1d;
|
||||
PrintWriter locusOutput;
|
||||
PrintWriter targetOutput;
|
||||
File locusOutputFile;
|
||||
File targetOutputFile;
|
||||
try {
|
||||
locusOutputFile = File.createTempFile("locus_output","txt");
|
||||
locusOutput = new PrintWriter(locusOutputFile);
|
||||
targetOutputFile = File.createTempFile("target_output","txt");
|
||||
targetOutput = new PrintWriter(targetOutputFile);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e);
|
||||
}
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
// update summary statistics
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") ) {
|
||||
aggregateMeanCoverage = calc.getMean();
|
||||
aggregateStdDevCoverage = Math.sqrt(calc.getVar());
|
||||
aggregateSkewCoverage = calc.getSkew();
|
||||
}
|
||||
} else {
|
||||
// update sample-based summary statistics and print output to file
|
||||
locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),calc.getLocusProportions());
|
||||
targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n", calc.getName(),calc.getTargetProportions());
|
||||
if ( calc.getPercentWellCoveredLoci() > 80 ) {
|
||||
numSamplesWithGoodBaseCoverage++;
|
||||
}
|
||||
if ( calc.getPercentWellCoveredTargets() > 90 ) {
|
||||
numSamplesWithGoodTargetCoverage++;
|
||||
}
|
||||
if ( calc.getMean() > 10 ) {
|
||||
numSamplesWithGoodMeanCoverage++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath());
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath());
|
||||
return "temporary";
|
||||
}
|
||||
|
||||
private SummaryFileCollection getFileHandles() {
|
||||
if ( zipFiles == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
SummaryFileCollection summaryFiles = new SummaryFileCollection();
|
||||
for ( String zipFile : zipFiles.split(",") ) {
|
||||
summaryFiles.process(zipFile);
|
||||
}
|
||||
|
||||
return summaryFiles;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
|
||||
BufferedReader docReader;
|
||||
|
||||
try {
|
||||
docReader = new BufferedReader( new FileReader(docFile) );
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
|
||||
}
|
||||
|
||||
String locusHeader = getDOCSectionHeader(docReader);
|
||||
List<DepthStatisticsCalculator> docCalculators = instantiateDOCCalculators(locusHeader);
|
||||
|
||||
updateLocusInfo(docCalculators,docReader);
|
||||
|
||||
String targetHeader = getDOCSectionHeader(docReader);
|
||||
|
||||
updateTargetInfo(docCalculators,docReader);
|
||||
|
||||
return docCalculators;
|
||||
}
|
||||
|
||||
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.get(offset).updateLocus(Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading locus depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
int targetSize = 0;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset == -1 ) {
|
||||
targetSize = parseInterval(entry);
|
||||
} else {
|
||||
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error reading target depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int parseInterval(String interval) {
|
||||
String startstop = interval.split(":")[1];
|
||||
int start = Integer.parseInt(startstop.split("-")[0]);
|
||||
int stop = Integer.parseInt(startstop.split("-")[1]);
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
private boolean isDOCSectionSeparator( String line ) {
|
||||
return line.contains("_COVERAGE_SECTION");
|
||||
}
|
||||
|
||||
private boolean isEndOfSection( String line ) {
|
||||
// sections delimited by empty line
|
||||
return line.equalsIgnoreCase("");
|
||||
}
|
||||
|
||||
private String getDOCSectionHeader(BufferedReader reader) {
|
||||
String header;
|
||||
try {
|
||||
do {
|
||||
header = reader.readLine();
|
||||
} while ( ! isDOCSectionSeparator(header) );
|
||||
|
||||
header = reader.readLine();
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Error reading depth of coverage file",e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> instantiateDOCCalculators(String header) {
|
||||
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
|
||||
|
||||
int offset = -1;
|
||||
for ( String entry : header.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.add(new DepthStatisticsCalculator(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
|
||||
return calcs;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class GenerateFirehoseSummary {
|
||||
|
||||
public static void main(String[] args) {
|
||||
FirehoseSummaryCLP clp = new FirehoseSummaryCLP();
|
||||
CommandLineProgram.start(clp, args);
|
||||
System.exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Container for the per-sample metrics files extracted from Firehose
 * sequencing-metrics zip archives: fingerprint, hybrid-selection, insert-size,
 * and alignment metrics, each keyed by sample name.
 */
class SummaryFileCollection {

    // sample name -> metrics file, one map per metrics type
    public Map<String,File> fingerprintSummaryFiles;
    public Map<String,File> hybridSelectionMetricsFiles;
    public Map<String,File> insertSizeDistributionFiles;
    public Map<String,File> alignmentMetricsFiles;

    public SummaryFileCollection() {
        fingerprintSummaryFiles = new HashMap<String,File>();
        hybridSelectionMetricsFiles = new HashMap<String, File>();
        insertSizeDistributionFiles = new HashMap<String,File>();
        alignmentMetricsFiles = new HashMap<String,File>();
    }

    /**
     * Unzips the given archive and registers the four expected metrics files for
     * the sample it belongs to.
     *
     * @param zipFilePath path to a "*_sequencing_metrics.zip" archive
     * @throws RuntimeException if the archive cannot be unzipped
     */
    public void process(String zipFilePath) {
        // NOTE(review): assumes paths of the form "<prefix>_<sample>_sequencing_metrics.zip";
        // confirm against the actual Firehose naming convention.
        String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1];
        File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics");
        File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics");
        File insertSizeFile = new File(sampleName+".insert_size_metrics");
        File alignmentFile = new File(sampleName+".alignment_metrics");

        try {
            // BUGFIX: use ProcessBuilder so a path containing spaces is passed as a
            // single argument (Runtime.exec(String) tokenizes on whitespace), and
            // wait for unzip to finish before the extracted files are used --
            // previously the process was fired and forgotten.
            Process p = new ProcessBuilder("unzip", zipFilePath).start();
            int exitCode = p.waitFor();
            if ( exitCode != 0 ) {
                throw new RuntimeException("unzip exited with code "+exitCode+" for file "+zipFilePath);
            }
        } catch (IOException e) {
            // BUGFIX: preserve the underlying cause instead of discarding it
            throw new RuntimeException("Could not unzip the file "+zipFilePath, e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
            throw new RuntimeException("Interrupted while unzipping "+zipFilePath, e);
        }

        fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile);
        hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile);
        insertSizeDistributionFiles.put(sampleName,insertSizeFile);
        alignmentMetricsFiles.put(sampleName,alignmentFile);
    }
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
/**
 * Streaming (on-the-fly) estimator of the first three sample moments -- mean,
 * variance, and skew -- of a data stream. The variance and skew updates reuse
 * the current running estimates rather than re-scanning all prior data, so they
 * are approximations that converge with sample size (see update(double)).
 *
 * @author chartl
 * @since Feb 12, 2010
 */
public class SummaryStatisticsCalculator {
    // todo -- can median / quantiles be estimated on-the-fly?
    // todo -- can we divine a metric for modality?
    private double mean;    // running mean of the samples fed to it
    private double var;     // running (approximate) variance of the samples fed to it
    private double skew;    // running (approximate) skew of the samples fed to it
    private int sampleSize; // number of samples given to calculator
    private String name;    // name to associate calculator with, if any

    /** Creates an unnamed calculator; the name defaults to "SUMMARY". */
    public SummaryStatisticsCalculator() {
        this("SUMMARY"); // delegate so initialization lives in one place
    }

    /**
     * Creates a calculator tagged with the given name.
     *
     * @param name label to associate with this calculator (e.g. a sample name)
     */
    public SummaryStatisticsCalculator(String name) {
        mean = 0;
        var = 0;
        skew = 0;
        sampleSize = 0;
        this.name = name;
    }

    /**
     * Folds a new datum into the running moment estimates.
     *
     * @param datum the new observation
     */
    public void update(double datum) {
        mean = (sampleSize*mean + datum)/(sampleSize + 1);
        var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1);
        // note: the variance is not re-calculated over all data points each time the mean changes
        // this means the convergence will be slower than x^-(1/2); but it will still converge with the mean
        // whole exome is ~3.3 million datapoints, which will be plenty for this to converge to within a very
        // small proportion of the true sample variance
        if ( var == 0 ) {
            // all samples so far are identical (or this is the first); skew is 0 by definition
            skew = 0;
        } else {
            // BUGFIX: skew is normalized by sigma^3 = var^(3/2). The previous code used
            // Math.pow(var, 2/3) where 2/3 is integer division == 0, so the normalization
            // factor was always 1 and the "skew" was never scale-free.
            double sigmaCubed = Math.pow(var, 1.5);
            skew = ( skew*sigmaCubed*sampleSize + Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )*sigmaCubed );
        }
        // again, new mean/variance estimates are not propagated over all previous data points,
        // so convergence is a bit slower; but that's the price one pays for on-the-fly calculation
        sampleSize++;
    }

    /** Convenience overload for integer data. */
    public void update(int datum) {
        update( (double) datum);
    }

    /** @return the running mean of all data seen so far */
    public double getMean() {
        return mean;
    }

    /** @return the running (approximate) variance of all data seen so far */
    public double getVar() {
        return var;
    }

    /** @return the running (approximate) skew of all data seen so far */
    public double getSkew() {
        return skew;
    }

    /** @return the name this calculator was constructed with */
    public String getName() {
        return name;
    }

    /** @return the number of data points folded in so far */
    public int getSampleSize() {
        return sampleSize;
    }
}
|
||||
|
|
@ -20,33 +20,33 @@ class VCFConcordanceCalculator {
|
|||
private int minimumDepthForUpdate;
|
||||
private int minimumGenotypeQuality;
|
||||
private String name;
|
||||
private Set<GenomeLoc> falsePositiveLoci;
|
||||
private Set<GenomeLoc> falseNegativeLoci;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToFilters;
|
||||
private Set<GenomeLoc> hetsCalledHoms;
|
||||
private Set<GenomeLoc> homsCalledHets;
|
||||
private Set<GenomeLoc> nonConfidentGenotypeCalls;
|
||||
private Set<GenomeLoc> concordantHomCalls;
|
||||
private Set<GenomeLoc> concordantHetCalls;
|
||||
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
||||
private Set<GenomeLoc> chipNoCalls;
|
||||
private Set<GenomeLoc> ignoredDueToDepth;
|
||||
private int falsePositiveLoci;
|
||||
private int falseNegativeLoci;
|
||||
private int falseNegativeLociDueToNoCall;
|
||||
private int falseNegativeLociDueToFilters;
|
||||
private int hetsCalledHoms;
|
||||
private int homsCalledHets;
|
||||
private int nonConfidentGenotypeCalls;
|
||||
private int concordantHomCalls;
|
||||
private int concordantHetCalls;
|
||||
private int concordantGenotypeReferenceCalls;
|
||||
private int chipNoCalls;
|
||||
private int ignoredDueToDepth;
|
||||
|
||||
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
|
||||
name = sampleName;
|
||||
falseNegativeLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
||||
falsePositiveLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToFilters = new HashSet<GenomeLoc>();
|
||||
hetsCalledHoms = new HashSet<GenomeLoc>();
|
||||
homsCalledHets = new HashSet<GenomeLoc>();
|
||||
nonConfidentGenotypeCalls = new HashSet<GenomeLoc>();
|
||||
concordantHomCalls = new HashSet<GenomeLoc>();
|
||||
concordantHetCalls = new HashSet<GenomeLoc>();
|
||||
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
||||
chipNoCalls = new HashSet<GenomeLoc>();
|
||||
ignoredDueToDepth = new HashSet<GenomeLoc>();
|
||||
falseNegativeLoci = 0;
|
||||
falseNegativeLociDueToNoCall = 0;
|
||||
falsePositiveLoci = 0;
|
||||
falseNegativeLociDueToFilters = 0;
|
||||
hetsCalledHoms = 0;
|
||||
homsCalledHets = 0;
|
||||
nonConfidentGenotypeCalls = 0;
|
||||
concordantHomCalls = 0;
|
||||
concordantHetCalls = 0;
|
||||
concordantGenotypeReferenceCalls = 0;
|
||||
chipNoCalls = 0;
|
||||
ignoredDueToDepth = 0;
|
||||
minimumDepthForUpdate = minimumDepth;
|
||||
minimumGenotypeQuality = minGenQual;
|
||||
}
|
||||
|
|
@ -57,37 +57,37 @@ class VCFConcordanceCalculator {
|
|||
|
||||
public void updateTruthOnly(LocusConcordanceInfo info) {
|
||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
|
||||
falseNegativeLoci.add(info.getLoc());
|
||||
falseNegativeLoci++;
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(info.getLoc());
|
||||
concordantGenotypeReferenceCalls++;
|
||||
}
|
||||
}
|
||||
|
||||
public void updateFilteredLocus(LocusConcordanceInfo info) {
|
||||
|
||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase()) ) {
|
||||
falseNegativeLociDueToFilters.add(info.getLoc());
|
||||
falseNegativeLociDueToFilters++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),
|
||||
concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(),
|
||||
homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),
|
||||
falseNegativeLociDueToNoCall.size(),falseNegativeLociDueToFilters.size());
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth,
|
||||
concordantGenotypeReferenceCalls,concordantHomCalls,concordantHetCalls,nonConfidentGenotypeCalls,
|
||||
homsCalledHets,hetsCalledHoms,falsePositiveLoci,falseNegativeLoci,
|
||||
falseNegativeLociDueToNoCall,falseNegativeLociDueToFilters);
|
||||
}
|
||||
|
||||
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||
if ( minimumDepthForUpdate > 0 && call.getReadCount() < minimumDepthForUpdate ) {
|
||||
ignoredDueToDepth.add(loc);
|
||||
ignoredDueToDepth++;
|
||||
} else if ( truth.isNoCall() ) {
|
||||
chipNoCalls.add(loc);
|
||||
chipNoCalls++;
|
||||
} else if ( truth.isVariant(( char) ref) ) {
|
||||
if ( call.isNoCall() ) {
|
||||
falseNegativeLociDueToNoCall.add(loc);
|
||||
falseNegativeLociDueToNoCall++;
|
||||
} else if ( ! call.isVariant( (char) ref ) ) {
|
||||
falseNegativeLoci.add(loc);
|
||||
falseNegativeLoci++;
|
||||
} else if ( call.isVariant((char) ref) ) {
|
||||
// check het vs hom
|
||||
checkGenotypeCall(truth,call, loc);
|
||||
|
|
@ -96,9 +96,9 @@ class VCFConcordanceCalculator {
|
|||
} else if ( ! truth.isVariant( (char) ref ) ) {
|
||||
|
||||
if ( call.isVariant((char) ref) ) {
|
||||
falsePositiveLoci.add(loc);
|
||||
falsePositiveLoci++;
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(loc);
|
||||
concordantGenotypeReferenceCalls++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -107,18 +107,18 @@ class VCFConcordanceCalculator {
|
|||
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
|
||||
|
||||
if ( truth.isHet() && call.isHom() ) {
|
||||
hetsCalledHoms.add(loc);
|
||||
hetsCalledHoms++;
|
||||
} else if ( truth.isHom() && call.isHet() ) {
|
||||
homsCalledHets.add(loc);
|
||||
homsCalledHets++;
|
||||
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
||||
concordantHetCalls.add(loc);
|
||||
concordantHetCalls++;
|
||||
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
||||
concordantHomCalls.add(loc);
|
||||
concordantHomCalls++;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
nonConfidentGenotypeCalls.add(loc);
|
||||
nonConfidentGenotypeCalls++;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,10 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
|
||||
import cern.jet.stat.Probability;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
||||
* to be contaminants (meaning a read group that's not actually associated with the sample) by searching
|
||||
|
|
@ -28,8 +32,14 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||
private Double LIMIT = 1e-9;
|
||||
|
||||
@Argument(fullName="scaleForSample", shortName="scale", doc="the scale by which the pvalue limit should reduce for testing samples directly. "+
|
||||
"E.g. if a sample has three 1e-3 read groups, pvalue is 1e-9 -- significant; so the scale should reduce by some multiplicative factor"+
|
||||
"For each read group associated with the sample. Defaults to 1e-4 [1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc]", required=false)
|
||||
private Double SCALE = 1e-4;
|
||||
|
||||
private UnifiedGenotyperEngine ug;
|
||||
private NamedTable altTable;
|
||||
private final double EPSILON = 1e-20;
|
||||
|
||||
public void initialize() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
@ -140,6 +150,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
//out.println("readgroup\tpvalue\tstatus\tbalances");
|
||||
out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances");
|
||||
|
||||
HashMap<String,Double> pvalByReadGroup = new HashMap<String,Double>();
|
||||
for (String rg : altTable.getRowNames()) {
|
||||
String balances = "";
|
||||
|
||||
|
|
@ -179,6 +190,8 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
|
||||
// Compute pValue
|
||||
double pValue = Probability.studentT(dof, t);
|
||||
pValue = pValue < EPSILON ? EPSILON : pValue;
|
||||
pvalByReadGroup.put(rg,pValue);
|
||||
|
||||
//out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances);
|
||||
out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n",
|
||||
|
|
@ -186,6 +199,37 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
String.format("%e", pValue),
|
||||
(pValue < LIMIT ? "aberrant" : "nominal"),
|
||||
balances);
|
||||
|
||||
logger.debug(rg);
|
||||
}
|
||||
|
||||
out.printf("%n%n%s%n","SECTION ON BADLY CONTAMINATED SAMPLES");
|
||||
out.printf("%s\t%s\t%s\t%s%n","sample","p-value","status","info");
|
||||
|
||||
HashMap<String,List<String>> samplesToReadGroups = new HashMap<String,List<String>>();
|
||||
for ( SAMReadGroupRecord rec : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
if ( samplesToReadGroups.containsKey(rec.getSample()) ) {
|
||||
samplesToReadGroups.get(rec.getSample()).add(rec.getReadGroupId());
|
||||
} else {
|
||||
ArrayList<String> newList = new ArrayList<String>();
|
||||
newList.add(rec.getReadGroupId());
|
||||
samplesToReadGroups.put(rec.getSample(),newList);
|
||||
}
|
||||
}
|
||||
|
||||
for ( String sample : samplesToReadGroups.keySet() ) {
|
||||
double p_value = 1;
|
||||
double limit = LIMIT;
|
||||
boolean containsAberrantReads = false;
|
||||
for ( String rg : samplesToReadGroups.get(sample) ) {
|
||||
double rg_pval = ( pvalByReadGroup.get(rg) == null ? 1 : pvalByReadGroup.get(rg) );
|
||||
p_value = p_value*rg_pval;
|
||||
containsAberrantReads = containsAberrantReads || rg_pval < LIMIT;
|
||||
limit = limit*SCALE;
|
||||
logger.debug(rg);
|
||||
}
|
||||
|
||||
out.printf("%s\t%-13s\t%s\t%s%n", sample, String.format("%e",p_value), ( p_value < limit ? "aberrant" : "nominal"), ( containsAberrantReads ? "contains_aberrant_RG" : "no_aberrant_RG"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue