Added - a walker that outputs relevant information about false negatives given a bunch of hapmap individuals and corresponding integration tests for it.
This will output for hapmap variant sites: chromosome position ref allele variant allele number of variant alleles of the individuals depth of coverage power to detect singletons at lod 3 number of variant bases seen whether or not variant was called git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2068 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b68d6e06b7
commit
be31d7f4cc
|
|
@ -0,0 +1,185 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.varianteval.ConcordanceTruthTable;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.LinkedList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Nov 12, 2009
|
||||
* Time: 12:31:58 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter> {
|
||||
@Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true)
|
||||
public String outputFileString = null;
|
||||
@Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true)
|
||||
public int poolSize = -1;
|
||||
@Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true)
|
||||
public String sampleNameFile = null;
|
||||
|
||||
private PrintWriter output;
|
||||
|
||||
private String[] sampleNames = null;
|
||||
private PowerBelowFrequencyWalker powerWalker = null;
|
||||
private ConcordanceTruthTable ctt = null;
|
||||
|
||||
public void initialize() {
|
||||
sampleNames = generateNameTableFromFile(sampleNameFile);
|
||||
powerWalker = new PowerBelowFrequencyWalker();
|
||||
powerWalker.initialize();
|
||||
powerWalker.setPoolSize(poolSize);
|
||||
ctt = new ConcordanceTruthTable(poolSize);
|
||||
}
|
||||
|
||||
public PrintWriter reduceInit() {
|
||||
try {
|
||||
output = new PrintWriter(outputFileString);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("File "+outputFileString+" could not be opened.", e);
|
||||
}
|
||||
output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called");
|
||||
//System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called");
|
||||
return output;
|
||||
}
|
||||
|
||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc loc = context.getLocation();
|
||||
String chrom = loc.getContig();
|
||||
long pos = loc.getStart();
|
||||
char refBase = Character.toUpperCase(ref.getBase());
|
||||
List<Pair<Genotype, Genotype>> chips = getChips(sampleNames, tracker);
|
||||
Pair<Genotype,Integer> alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase);
|
||||
char alternate;
|
||||
if ( alleleFreqInfo.getFirst() != null && alleleFreqInfo.getFirst().isVariant(refBase)) {
|
||||
//System.out.println(refBase + " " + alleleFreqInfo.getFirst().getBases());
|
||||
alternate = getAlternateBase(alleleFreqInfo.getFirst(),refBase);
|
||||
|
||||
} else {
|
||||
return null; // early return
|
||||
}
|
||||
int numVariantAllele = alleleFreqInfo.getSecond();
|
||||
int depth = context.numReads();
|
||||
double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele);
|
||||
int called;
|
||||
Variation call = (Variation) tracker.lookup("calls",null);
|
||||
if ( call == null ) {
|
||||
called = 0;
|
||||
} else if ( call.isReference() ) {
|
||||
called = 0;
|
||||
} else {
|
||||
called = 1;
|
||||
}
|
||||
|
||||
ReadBackedPileup p = new ReadBackedPileup(ref.getBase(),context);
|
||||
int support = p.getBasePileupAsCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)];
|
||||
|
||||
// sanity check
|
||||
if ( refBase == alternate ) {
|
||||
if ( alleleFreqInfo.getFirst().isVariant(refBase) ) {
|
||||
logger.warn("Called as a variant! Ref: "+ refBase +"Chip data: " + alleleFreqInfo.getFirst().getBases());
|
||||
}
|
||||
}
|
||||
|
||||
return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,depth,power,support,called);
|
||||
|
||||
}
|
||||
|
||||
public char getAlternateBase(Genotype g, char ref) {
|
||||
char[] bases = g.getBases().toCharArray();
|
||||
if ( Character.toUpperCase(bases[0]) != ref )
|
||||
return bases[0];
|
||||
else
|
||||
return bases[1];
|
||||
}
|
||||
|
||||
public PrintWriter reduce(String s, PrintWriter p) {
|
||||
if ( s == null ) {
|
||||
// do nothing
|
||||
return p;
|
||||
} else {
|
||||
//System.out.printf("%s%n",s);
|
||||
output.printf("%s%n",s);
|
||||
return p;
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(PrintWriter p) {
|
||||
output.close();
|
||||
}
|
||||
|
||||
private List<Pair<Genotype,Genotype>> getChips(String[] rodNames, RefMetaDataTracker tracker) {
|
||||
List<Pair<Genotype, Genotype>> chips = new ArrayList <Pair<Genotype,Genotype>>(rodNames.length);
|
||||
for ( String name : rodNames ) {
|
||||
RODRecordList<ReferenceOrderedDatum> rods = tracker.getTrackData(name, null);
|
||||
Variation chip = (rods == null ? null : (Variation)rods.getRecords().get(0));
|
||||
if ( chip != null ) {
|
||||
// chips must be Genotypes
|
||||
if ( !(chip instanceof VariantBackedByGenotype) )
|
||||
throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
|
||||
chips.add(new Pair<Genotype,Genotype>(((VariantBackedByGenotype)chip).getCalledGenotype(),null));
|
||||
}
|
||||
}
|
||||
|
||||
return chips;
|
||||
}
|
||||
// private methods for reading in names from a file
|
||||
|
||||
private String[] generateNameTableFromFile(String file) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(file));
|
||||
} catch( FileNotFoundException e) {
|
||||
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
LinkedList<String> nameList = new LinkedList<String>();
|
||||
|
||||
while(continueReading(reader)) {
|
||||
String line = readLine(reader);
|
||||
nameList.add(line);
|
||||
}
|
||||
|
||||
return nameList.toArray(new String[nameList.size()]);
|
||||
}
|
||||
|
||||
private boolean continueReading(BufferedReader reader) {
|
||||
boolean continueReading;
|
||||
try {
|
||||
continueReading = reader.ready();
|
||||
} catch(IOException e) {
|
||||
continueReading = false;
|
||||
}
|
||||
return continueReading;
|
||||
}
|
||||
|
||||
private String readLine(BufferedReader reader) {
|
||||
String line;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch( IOException e) {
|
||||
String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
|
||||
throw new StingException(errMsg,e);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Nov 18, 2009
|
||||
* Time: 9:29:31 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class HapmapPoolAllelicInfoIntegrationTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void testFHSPool3() {
|
||||
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
|
||||
+ "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
|
||||
+ "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
|
||||
+ "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
|
||||
+ "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
|
||||
+ "-ps 40 -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
|
||||
String md5ForThisTest = "9e44a04ea574a7f76efbc3a1f8cc710a";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
|
||||
executeTest("Pool 3 of FHS Pilot on testbed intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFHSPool3NoIntervals() {
|
||||
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
|
||||
+ "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
|
||||
+ "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
|
||||
+ "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
|
||||
+ "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
|
||||
+ "-ps 40";
|
||||
String md5ForThisTest = "571c10a635a84d0bbd453897ef4fb3c6";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
|
||||
executeTest("Pool 3 of FHS Pilot without testbed intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSmallPool() {
|
||||
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878_NA12891_samples.txt "
|
||||
+"-B NA12878,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12878.gff "
|
||||
+"-B NA12891,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12891.gff "
|
||||
+"-I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.recal.annotation_subset.bam "
|
||||
+"-L chr1:14000000-18000000 -B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.ssg1b.geli.calls "
|
||||
+"-of %s -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -ps 2";
|
||||
String md5ForThisTest = "46c6a22ab57c40453f29b99a57288e0e";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
|
||||
executeTest("Test on pool of two individuals -- CEU father+daughter chips against subset of the father's calls; power at lod 3", spec);
|
||||
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue