Initial commit of tools under development for data QC through firehose.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2834 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
77af5822d4
commit
04a2784bf7
|
|
@ -0,0 +1,24 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.filters;
|
||||||
|
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||||
|
*
|
||||||
|
* @Author chartl
|
||||||
|
* @Date Feb 8, 2010
|
||||||
|
*/
|
||||||
|
public class ContaminatedSampleFilter implements SamRecordFilter {
|
||||||
|
|
||||||
|
private final String[] filteredNames = {"NA19562","NA19006","NA19554","NA18985","NA18988","NA19062","NA19559"};
|
||||||
|
|
||||||
|
private HashSet<String> contaminatedSamples = new HashSet<String>( Arrays.asList(filteredNames) );
|
||||||
|
|
||||||
|
public boolean filterOut(SAMRecord read) {
|
||||||
|
return contaminatedSamples.contains(read.getReadGroup().getSample());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||||
|
*
|
||||||
|
* @Author chartl
|
||||||
|
* @Date Feb 12, 2010
|
||||||
|
*/
|
||||||
|
public class DepthStatisticsCalculator extends SummaryStatisticsCalculator {
|
||||||
|
private int numBasesAbove0x; // % of bases with ANY coverage
|
||||||
|
private int numBasesAbove3x; // % of bases with 4x+
|
||||||
|
private int numBasesAbove9x; // % of bases with 10x+
|
||||||
|
private int numBasesAbove24x; // % of bases with 25x+
|
||||||
|
private int numBasesAbove49x; // % of bases with 50x+
|
||||||
|
private int numBasesAbove99x; // % of bases with 100x+
|
||||||
|
|
||||||
|
private int targetsAbove0x;
|
||||||
|
private int targetsAbove3x;
|
||||||
|
private int targetsAbove9x;
|
||||||
|
private int targetsAbove24x;
|
||||||
|
private int targetsAbove49x;
|
||||||
|
private int targetsAbove99x;
|
||||||
|
private int numTargets;
|
||||||
|
|
||||||
|
public static double[] DEPTH_CUTOFFS = {1,4,10,25,50,100};
|
||||||
|
|
||||||
|
public DepthStatisticsCalculator(String name) {
|
||||||
|
super(name);
|
||||||
|
numBasesAbove0x = 0;
|
||||||
|
numBasesAbove3x = 0;
|
||||||
|
numBasesAbove9x = 0;
|
||||||
|
numBasesAbove24x = 0;
|
||||||
|
numBasesAbove49x = 0;
|
||||||
|
numBasesAbove99x = 0;
|
||||||
|
targetsAbove99x = 0;
|
||||||
|
targetsAbove49x = 0;
|
||||||
|
targetsAbove24x = 0;
|
||||||
|
targetsAbove9x = 0;
|
||||||
|
targetsAbove3x = 0;
|
||||||
|
targetsAbove0x = 0;
|
||||||
|
numTargets = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateLocus(int depth) {
|
||||||
|
super.update(depth);
|
||||||
|
|
||||||
|
if ( depth > 99 ) {
|
||||||
|
numBasesAbove99x++;
|
||||||
|
}
|
||||||
|
if ( depth > 49 ) {
|
||||||
|
numBasesAbove49x++;
|
||||||
|
}
|
||||||
|
if ( depth > 24 ) {
|
||||||
|
numBasesAbove24x++;
|
||||||
|
}
|
||||||
|
if ( depth > 9 ) {
|
||||||
|
numBasesAbove9x++;
|
||||||
|
}
|
||||||
|
if ( depth > 3 ) {
|
||||||
|
numBasesAbove3x++;
|
||||||
|
}
|
||||||
|
if ( depth > 0 ) {
|
||||||
|
numBasesAbove0x++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateTargets(int targetLength, int totalCoverage) {
|
||||||
|
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength);
|
||||||
|
|
||||||
|
if ( avgCvg >= 100 ) {
|
||||||
|
targetsAbove99x ++;
|
||||||
|
}
|
||||||
|
if ( avgCvg >= 50 ) {
|
||||||
|
targetsAbove49x++;
|
||||||
|
}
|
||||||
|
if ( avgCvg >= 25 ) {
|
||||||
|
targetsAbove24x++;
|
||||||
|
}
|
||||||
|
if ( avgCvg >= 10 ) {
|
||||||
|
targetsAbove9x++;
|
||||||
|
}
|
||||||
|
if ( avgCvg >= 4 ) {
|
||||||
|
targetsAbove3x++;
|
||||||
|
}
|
||||||
|
if ( avgCvg >= 1 ) {
|
||||||
|
targetsAbove0x++;
|
||||||
|
}
|
||||||
|
|
||||||
|
numTargets++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double[] getLocusProportions() {
|
||||||
|
double[] proportions = new double[6];
|
||||||
|
|
||||||
|
proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() );
|
||||||
|
proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||||
|
proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() );
|
||||||
|
proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() );
|
||||||
|
proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() );
|
||||||
|
proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() );
|
||||||
|
|
||||||
|
return proportions;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double[] getTargetProportions() {
|
||||||
|
double[] proportions = new double[6];
|
||||||
|
|
||||||
|
proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets );
|
||||||
|
proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets );
|
||||||
|
proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets );
|
||||||
|
proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets );
|
||||||
|
proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets );
|
||||||
|
proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets );
|
||||||
|
|
||||||
|
return proportions;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getPercentWellCoveredLoci() {
|
||||||
|
return 10*( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getPercentWellCoveredTargets() {
|
||||||
|
return 10*( (double) targetsAbove9x )/( (double) numTargets );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,257 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||||
|
*
|
||||||
|
* @Author chartl
|
||||||
|
* @Date Feb 11, 2010
|
||||||
|
*/
|
||||||
|
|
||||||
|
class FirehoseSummaryCLP extends CommandLineProgram {
|
||||||
|
@Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true)
|
||||||
|
private File depthOfCoverage = null;
|
||||||
|
@Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true)
|
||||||
|
private File contamination = null;
|
||||||
|
@Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true)
|
||||||
|
private File errorRate = null;
|
||||||
|
@Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false)
|
||||||
|
private String zipFiles = null;
|
||||||
|
|
||||||
|
private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R";
|
||||||
|
private static String SCRIPT_DOC_FLAG = "DOC";
|
||||||
|
|
||||||
|
protected int execute() {
|
||||||
|
SummaryFileCollection metricsFiles = getFileHandles();
|
||||||
|
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(depthOfCoverage);
|
||||||
|
String docSummary = makeDOCPlots(depthStats);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String makeDOCPlots(List<DepthStatisticsCalculator> calcs) {
|
||||||
|
StringBuilder summaryBuilder = new StringBuilder();
|
||||||
|
int numSamplesWithGoodBaseCoverage=0;
|
||||||
|
int numSamplesWithGoodMeanCoverage = 0;
|
||||||
|
int numSamplesWithGoodTargetCoverage=0;
|
||||||
|
double aggregateMeanCoverage = -1d;
|
||||||
|
double aggregateStdDevCoverage = -1d;
|
||||||
|
double aggregateSkewCoverage = -1d;
|
||||||
|
PrintWriter locusOutput;
|
||||||
|
PrintWriter targetOutput;
|
||||||
|
File locusOutputFile;
|
||||||
|
File targetOutputFile;
|
||||||
|
try {
|
||||||
|
locusOutputFile = File.createTempFile("locus_output","txt");
|
||||||
|
locusOutput = new PrintWriter(locusOutputFile);
|
||||||
|
targetOutputFile = File.createTempFile("target_output","txt");
|
||||||
|
targetOutput = new PrintWriter(targetOutputFile);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||||
|
|
||||||
|
if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||||
|
// update summary statistics
|
||||||
|
if ( calc.getName().equalsIgnoreCase("total_coverage") ) {
|
||||||
|
aggregateMeanCoverage = calc.getMean();
|
||||||
|
aggregateStdDevCoverage = Math.sqrt(calc.getVar());
|
||||||
|
aggregateSkewCoverage = calc.getSkew();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// update sample-based summary statistics and print output to file
|
||||||
|
locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),calc.getLocusProportions());
|
||||||
|
targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n", calc.getName(),calc.getTargetProportions());
|
||||||
|
if ( calc.getPercentWellCoveredLoci() > 80 ) {
|
||||||
|
numSamplesWithGoodBaseCoverage++;
|
||||||
|
}
|
||||||
|
if ( calc.getPercentWellCoveredTargets() > 90 ) {
|
||||||
|
numSamplesWithGoodTargetCoverage++;
|
||||||
|
}
|
||||||
|
if ( calc.getMean() > 10 ) {
|
||||||
|
numSamplesWithGoodMeanCoverage++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath());
|
||||||
|
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath());
|
||||||
|
return "temporary";
|
||||||
|
}
|
||||||
|
|
||||||
|
private SummaryFileCollection getFileHandles() {
|
||||||
|
if ( zipFiles == null ) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
SummaryFileCollection summaryFiles = new SummaryFileCollection();
|
||||||
|
for ( String zipFile : zipFiles.split(",") ) {
|
||||||
|
summaryFiles.process(zipFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
return summaryFiles;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
|
||||||
|
BufferedReader docReader;
|
||||||
|
|
||||||
|
try {
|
||||||
|
docReader = new BufferedReader( new FileReader(docFile) );
|
||||||
|
} catch ( IOException e) {
|
||||||
|
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
String locusHeader = getDOCSectionHeader(docReader);
|
||||||
|
List<DepthStatisticsCalculator> docCalculators = instantiateDOCCalculators(locusHeader);
|
||||||
|
|
||||||
|
updateLocusInfo(docCalculators,docReader);
|
||||||
|
|
||||||
|
String targetHeader = getDOCSectionHeader(docReader);
|
||||||
|
|
||||||
|
updateTargetInfo(docCalculators,docReader);
|
||||||
|
|
||||||
|
return docCalculators;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||||
|
|
||||||
|
String docLocLine;
|
||||||
|
try {
|
||||||
|
docLocLine = reader.readLine();
|
||||||
|
while ( ! isEndOfSection(docLocLine) ) {
|
||||||
|
int offset = -1;
|
||||||
|
for ( String entry : docLocLine.split("\t") ) {
|
||||||
|
if ( offset > -1 ) {
|
||||||
|
calcs.get(offset).updateLocus(Integer.parseInt(entry));
|
||||||
|
}
|
||||||
|
offset++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( IOException e) {
|
||||||
|
throw new StingException("Error reading locus depth of coverage information",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||||
|
|
||||||
|
String docLocLine;
|
||||||
|
try {
|
||||||
|
docLocLine = reader.readLine();
|
||||||
|
while ( ! isEndOfSection(docLocLine) ) {
|
||||||
|
int offset = -1;
|
||||||
|
int targetSize = 0;
|
||||||
|
for ( String entry : docLocLine.split("\t") ) {
|
||||||
|
if ( offset == -1 ) {
|
||||||
|
targetSize = parseInterval(entry);
|
||||||
|
} else {
|
||||||
|
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
|
||||||
|
}
|
||||||
|
offset++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new StingException("Error reading target depth of coverage information",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private int parseInterval(String interval) {
|
||||||
|
String startstop = interval.split(":")[1];
|
||||||
|
int start = Integer.parseInt(startstop.split("-")[0]);
|
||||||
|
int stop = Integer.parseInt(startstop.split("-")[1]);
|
||||||
|
return stop - start;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isDOCSectionSeparator( String line ) {
|
||||||
|
return line.contains("_COVERAGE_SECTION");
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isEndOfSection( String line ) {
|
||||||
|
// sections delimited by empty line
|
||||||
|
return line.equalsIgnoreCase("");
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getDOCSectionHeader(BufferedReader reader) {
|
||||||
|
String header;
|
||||||
|
try {
|
||||||
|
do {
|
||||||
|
header = reader.readLine();
|
||||||
|
} while ( ! isDOCSectionSeparator(header) );
|
||||||
|
|
||||||
|
header = reader.readLine();
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException("Error reading depth of coverage file",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return header;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<DepthStatisticsCalculator> instantiateDOCCalculators(String header) {
|
||||||
|
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
|
||||||
|
|
||||||
|
int offset = -1;
|
||||||
|
for ( String entry : header.split("\t") ) {
|
||||||
|
if ( offset > -1 ) {
|
||||||
|
calcs.add(new DepthStatisticsCalculator(entry));
|
||||||
|
}
|
||||||
|
offset++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return calcs;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public class GenerateFirehoseSummary {
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
FirehoseSummaryCLP clp = new FirehoseSummaryCLP();
|
||||||
|
CommandLineProgram.start(clp, args);
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class SummaryFileCollection {
|
||||||
|
|
||||||
|
// container class for files we'll be summarizing
|
||||||
|
|
||||||
|
public Map<String,File> fingerprintSummaryFiles;
|
||||||
|
public Map<String,File> hybridSelectionMetricsFiles;
|
||||||
|
public Map<String,File> insertSizeDistributionFiles;
|
||||||
|
public Map<String,File> alignmentMetricsFiles;
|
||||||
|
|
||||||
|
public SummaryFileCollection() {
|
||||||
|
fingerprintSummaryFiles = new HashMap<String,File>();
|
||||||
|
hybridSelectionMetricsFiles = new HashMap<String, File>();
|
||||||
|
insertSizeDistributionFiles = new HashMap<String,File>();
|
||||||
|
alignmentMetricsFiles = new HashMap<String,File>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void process(String zipFilePath) {
|
||||||
|
String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1];
|
||||||
|
File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics");
|
||||||
|
File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics");
|
||||||
|
File insertSizeFile = new File(sampleName+".insert_size_metrics");
|
||||||
|
File alignmentFile = new File(sampleName+".alignment_metrics");
|
||||||
|
|
||||||
|
String command = "unzip "+zipFilePath;
|
||||||
|
try {
|
||||||
|
Process p = Runtime.getRuntime().exec(command);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Could not unzip the file "+zipFilePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile);
|
||||||
|
hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile);
|
||||||
|
insertSizeDistributionFiles.put(sampleName,insertSizeFile);
|
||||||
|
alignmentMetricsFiles.put(sampleName,alignmentFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This object calculates the first three sample moments of a data set on-the-fly
|
||||||
|
* @Author chartl
|
||||||
|
* @Date Feb 12, 2010
|
||||||
|
*/
|
||||||
|
public class SummaryStatisticsCalculator {
|
||||||
|
// todo -- can median / quantiles be estimated on-the-fly?
|
||||||
|
// todo -- can we divine a metric for modality?
|
||||||
|
private double mean; // mean of the samples fed to it
|
||||||
|
private double var; // variance of the samples fed to it
|
||||||
|
private double skew; // skew of the samples fed to it
|
||||||
|
private int sampleSize; // number of samples given to calculator
|
||||||
|
private String name; // name to associate calculator with, if any
|
||||||
|
|
||||||
|
public SummaryStatisticsCalculator() {
|
||||||
|
mean = 0;
|
||||||
|
var = 0;
|
||||||
|
skew = 0;
|
||||||
|
sampleSize = 0;
|
||||||
|
name = "SUMMARY";
|
||||||
|
}
|
||||||
|
|
||||||
|
public SummaryStatisticsCalculator(String name) {
|
||||||
|
mean = 0;
|
||||||
|
var = 0;
|
||||||
|
skew = 0;
|
||||||
|
sampleSize = 0;
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void update(double datum) {
|
||||||
|
mean = (sampleSize*mean + datum)/(sampleSize + 1);
|
||||||
|
var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1);
|
||||||
|
// note: the variance is not re-calculated over all data points each time the mean changes
|
||||||
|
// this means the convergence will be slower than x^-(1/2); but it will still converge with the mean
|
||||||
|
// whole exome is ~3.3 million datapoints, which will be plenty for this to converge to within a very
|
||||||
|
// small proportion of the true sample variance
|
||||||
|
skew = ( skew*Math.pow(var,2/3)*sampleSize+ Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )* Math.pow(var,2/3) );
|
||||||
|
// again, new mean/variance estimates are not propagated over all previous data points, so convergence is a bit slower
|
||||||
|
// but that's the price one pays for on-the-fly calculation
|
||||||
|
sampleSize++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void update(int datum) {
|
||||||
|
update( (double) datum);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getMean() {
|
||||||
|
return mean;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getVar() {
|
||||||
|
return var;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getSkew() {
|
||||||
|
return skew;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSampleSize() {
|
||||||
|
return sampleSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -20,33 +20,33 @@ class VCFConcordanceCalculator {
|
||||||
private int minimumDepthForUpdate;
|
private int minimumDepthForUpdate;
|
||||||
private int minimumGenotypeQuality;
|
private int minimumGenotypeQuality;
|
||||||
private String name;
|
private String name;
|
||||||
private Set<GenomeLoc> falsePositiveLoci;
|
private int falsePositiveLoci;
|
||||||
private Set<GenomeLoc> falseNegativeLoci;
|
private int falseNegativeLoci;
|
||||||
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
private int falseNegativeLociDueToNoCall;
|
||||||
private Set<GenomeLoc> falseNegativeLociDueToFilters;
|
private int falseNegativeLociDueToFilters;
|
||||||
private Set<GenomeLoc> hetsCalledHoms;
|
private int hetsCalledHoms;
|
||||||
private Set<GenomeLoc> homsCalledHets;
|
private int homsCalledHets;
|
||||||
private Set<GenomeLoc> nonConfidentGenotypeCalls;
|
private int nonConfidentGenotypeCalls;
|
||||||
private Set<GenomeLoc> concordantHomCalls;
|
private int concordantHomCalls;
|
||||||
private Set<GenomeLoc> concordantHetCalls;
|
private int concordantHetCalls;
|
||||||
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
private int concordantGenotypeReferenceCalls;
|
||||||
private Set<GenomeLoc> chipNoCalls;
|
private int chipNoCalls;
|
||||||
private Set<GenomeLoc> ignoredDueToDepth;
|
private int ignoredDueToDepth;
|
||||||
|
|
||||||
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
|
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
|
||||||
name = sampleName;
|
name = sampleName;
|
||||||
falseNegativeLoci = new HashSet<GenomeLoc>();
|
falseNegativeLoci = 0;
|
||||||
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
falseNegativeLociDueToNoCall = 0;
|
||||||
falsePositiveLoci = new HashSet<GenomeLoc>();
|
falsePositiveLoci = 0;
|
||||||
falseNegativeLociDueToFilters = new HashSet<GenomeLoc>();
|
falseNegativeLociDueToFilters = 0;
|
||||||
hetsCalledHoms = new HashSet<GenomeLoc>();
|
hetsCalledHoms = 0;
|
||||||
homsCalledHets = new HashSet<GenomeLoc>();
|
homsCalledHets = 0;
|
||||||
nonConfidentGenotypeCalls = new HashSet<GenomeLoc>();
|
nonConfidentGenotypeCalls = 0;
|
||||||
concordantHomCalls = new HashSet<GenomeLoc>();
|
concordantHomCalls = 0;
|
||||||
concordantHetCalls = new HashSet<GenomeLoc>();
|
concordantHetCalls = 0;
|
||||||
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
concordantGenotypeReferenceCalls = 0;
|
||||||
chipNoCalls = new HashSet<GenomeLoc>();
|
chipNoCalls = 0;
|
||||||
ignoredDueToDepth = new HashSet<GenomeLoc>();
|
ignoredDueToDepth = 0;
|
||||||
minimumDepthForUpdate = minimumDepth;
|
minimumDepthForUpdate = minimumDepth;
|
||||||
minimumGenotypeQuality = minGenQual;
|
minimumGenotypeQuality = minGenQual;
|
||||||
}
|
}
|
||||||
|
|
@ -57,37 +57,37 @@ class VCFConcordanceCalculator {
|
||||||
|
|
||||||
public void updateTruthOnly(LocusConcordanceInfo info) {
|
public void updateTruthOnly(LocusConcordanceInfo info) {
|
||||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
|
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
|
||||||
falseNegativeLoci.add(info.getLoc());
|
falseNegativeLoci++;
|
||||||
} else {
|
} else {
|
||||||
concordantGenotypeReferenceCalls.add(info.getLoc());
|
concordantGenotypeReferenceCalls++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateFilteredLocus(LocusConcordanceInfo info) {
|
public void updateFilteredLocus(LocusConcordanceInfo info) {
|
||||||
|
|
||||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase()) ) {
|
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase()) ) {
|
||||||
falseNegativeLociDueToFilters.add(info.getLoc());
|
falseNegativeLociDueToFilters++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),
|
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth,
|
||||||
concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(),
|
concordantGenotypeReferenceCalls,concordantHomCalls,concordantHetCalls,nonConfidentGenotypeCalls,
|
||||||
homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),
|
homsCalledHets,hetsCalledHoms,falsePositiveLoci,falseNegativeLoci,
|
||||||
falseNegativeLociDueToNoCall.size(),falseNegativeLociDueToFilters.size());
|
falseNegativeLociDueToNoCall,falseNegativeLociDueToFilters);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||||
if ( minimumDepthForUpdate > 0 && call.getReadCount() < minimumDepthForUpdate ) {
|
if ( minimumDepthForUpdate > 0 && call.getReadCount() < minimumDepthForUpdate ) {
|
||||||
ignoredDueToDepth.add(loc);
|
ignoredDueToDepth++;
|
||||||
} else if ( truth.isNoCall() ) {
|
} else if ( truth.isNoCall() ) {
|
||||||
chipNoCalls.add(loc);
|
chipNoCalls++;
|
||||||
} else if ( truth.isVariant(( char) ref) ) {
|
} else if ( truth.isVariant(( char) ref) ) {
|
||||||
if ( call.isNoCall() ) {
|
if ( call.isNoCall() ) {
|
||||||
falseNegativeLociDueToNoCall.add(loc);
|
falseNegativeLociDueToNoCall++;
|
||||||
} else if ( ! call.isVariant( (char) ref ) ) {
|
} else if ( ! call.isVariant( (char) ref ) ) {
|
||||||
falseNegativeLoci.add(loc);
|
falseNegativeLoci++;
|
||||||
} else if ( call.isVariant((char) ref) ) {
|
} else if ( call.isVariant((char) ref) ) {
|
||||||
// check het vs hom
|
// check het vs hom
|
||||||
checkGenotypeCall(truth,call, loc);
|
checkGenotypeCall(truth,call, loc);
|
||||||
|
|
@ -96,9 +96,9 @@ class VCFConcordanceCalculator {
|
||||||
} else if ( ! truth.isVariant( (char) ref ) ) {
|
} else if ( ! truth.isVariant( (char) ref ) ) {
|
||||||
|
|
||||||
if ( call.isVariant((char) ref) ) {
|
if ( call.isVariant((char) ref) ) {
|
||||||
falsePositiveLoci.add(loc);
|
falsePositiveLoci++;
|
||||||
} else {
|
} else {
|
||||||
concordantGenotypeReferenceCalls.add(loc);
|
concordantGenotypeReferenceCalls++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -107,18 +107,18 @@ class VCFConcordanceCalculator {
|
||||||
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
|
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
|
||||||
|
|
||||||
if ( truth.isHet() && call.isHom() ) {
|
if ( truth.isHet() && call.isHom() ) {
|
||||||
hetsCalledHoms.add(loc);
|
hetsCalledHoms++;
|
||||||
} else if ( truth.isHom() && call.isHet() ) {
|
} else if ( truth.isHom() && call.isHet() ) {
|
||||||
homsCalledHets.add(loc);
|
homsCalledHets++;
|
||||||
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
||||||
concordantHetCalls.add(loc);
|
concordantHetCalls++;
|
||||||
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
||||||
concordantHomCalls.add(loc);
|
concordantHomCalls++;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
nonConfidentGenotypeCalls.add(loc);
|
nonConfidentGenotypeCalls++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,10 @@ import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
|
||||||
import cern.jet.stat.Probability;
|
import cern.jet.stat.Probability;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
* FindContaminatingReadGroupsWalker lists read groups in a single-sample BAM file that appear
|
||||||
* to be contaminants (meaning a read group that's not actually associated with the sample) by searching
|
* to be contaminants (meaning a read group that's not actually associated with the sample) by searching
|
||||||
|
|
@ -28,8 +32,14 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
||||||
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||||
private Double LIMIT = 1e-9;
|
private Double LIMIT = 1e-9;
|
||||||
|
|
||||||
|
@Argument(fullName="scaleForSample", shortName="scale", doc="the scale by which the pvalue limit should reduce for testing samples directly. "+
|
||||||
|
"E.g. if a sample has three 1e-3 read groups, pvalue is 1e-9 -- significant; so the scale should reduce by some multiplicative factor"+
|
||||||
|
"For each read group associated with the sample. Defaults to 1e-4 [1e-9 for 1 RG, 1e-13 for 2 RG, 1e-17 for 3, etc]", required=false)
|
||||||
|
private Double SCALE = 1e-4;
|
||||||
|
|
||||||
private UnifiedGenotyperEngine ug;
|
private UnifiedGenotyperEngine ug;
|
||||||
private NamedTable altTable;
|
private NamedTable altTable;
|
||||||
|
private final double EPSILON = 1e-20;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||||
|
|
@ -140,6 +150,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
||||||
//out.println("readgroup\tpvalue\tstatus\tbalances");
|
//out.println("readgroup\tpvalue\tstatus\tbalances");
|
||||||
out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances");
|
out.printf("%-10s\t%-13s\t%-10s\t%-10s%n", "readgroup", "pvalue", "status", "balances");
|
||||||
|
|
||||||
|
HashMap<String,Double> pvalByReadGroup = new HashMap<String,Double>();
|
||||||
for (String rg : altTable.getRowNames()) {
|
for (String rg : altTable.getRowNames()) {
|
||||||
String balances = "";
|
String balances = "";
|
||||||
|
|
||||||
|
|
@ -179,6 +190,8 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
||||||
|
|
||||||
// Compute pValue
|
// Compute pValue
|
||||||
double pValue = Probability.studentT(dof, t);
|
double pValue = Probability.studentT(dof, t);
|
||||||
|
pValue = pValue < EPSILON ? EPSILON : pValue;
|
||||||
|
pvalByReadGroup.put(rg,pValue);
|
||||||
|
|
||||||
//out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances);
|
//out.printf("%s\t%e\t%s\t[%s]\n", rg, pValue, (pValue < LIMIT ? "aberrant" : "nominal"), balances);
|
||||||
out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n",
|
out.printf("%-10s\t%-13s\t%-10s\t[%-10s]\n",
|
||||||
|
|
@ -186,6 +199,37 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
||||||
String.format("%e", pValue),
|
String.format("%e", pValue),
|
||||||
(pValue < LIMIT ? "aberrant" : "nominal"),
|
(pValue < LIMIT ? "aberrant" : "nominal"),
|
||||||
balances);
|
balances);
|
||||||
|
|
||||||
|
logger.debug(rg);
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%n%n%s%n","SECTION ON BADLY CONTAMINATED SAMPLES");
|
||||||
|
out.printf("%s\t%s\t%s\t%s%n","sample","p-value","status","info");
|
||||||
|
|
||||||
|
HashMap<String,List<String>> samplesToReadGroups = new HashMap<String,List<String>>();
|
||||||
|
for ( SAMReadGroupRecord rec : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||||
|
if ( samplesToReadGroups.containsKey(rec.getSample()) ) {
|
||||||
|
samplesToReadGroups.get(rec.getSample()).add(rec.getReadGroupId());
|
||||||
|
} else {
|
||||||
|
ArrayList<String> newList = new ArrayList<String>();
|
||||||
|
newList.add(rec.getReadGroupId());
|
||||||
|
samplesToReadGroups.put(rec.getSample(),newList);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( String sample : samplesToReadGroups.keySet() ) {
|
||||||
|
double p_value = 1;
|
||||||
|
double limit = LIMIT;
|
||||||
|
boolean containsAberrantReads = false;
|
||||||
|
for ( String rg : samplesToReadGroups.get(sample) ) {
|
||||||
|
double rg_pval = ( pvalByReadGroup.get(rg) == null ? 1 : pvalByReadGroup.get(rg) );
|
||||||
|
p_value = p_value*rg_pval;
|
||||||
|
containsAberrantReads = containsAberrantReads || rg_pval < LIMIT;
|
||||||
|
limit = limit*SCALE;
|
||||||
|
logger.debug(rg);
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%s\t%-13s\t%s\t%s%n", sample, String.format("%e",p_value), ( p_value < limit ? "aberrant" : "nominal"), ( containsAberrantReads ? "contains_aberrant_RG" : "no_aberrant_RG"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue