a) Fully functional and working multiallelic exact model for pools. Needs cleanup/more testing. b) Better unit test for pool genotype likelihoods - it now optionally generates actual noisy pileups that can be used for assessing GL accuracy, c) Totally experimental, hidden option in VariantsToTable to output genotype fields. Specifying -GF will output columns of form Sample.FieldName - needs also more testing

This commit is contained in:
Guillermo del Angel 2012-05-14 10:55:35 -04:00
parent 67e5c3ff9f
commit ae26f0fe14
2 changed files with 79 additions and 9 deletions

View File

@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -114,6 +117,10 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
public List<String> fieldsToTake = new ArrayList<String>();
@Hidden
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each field to capture for output in the table", required=true)
public List<String> genotypeFieldsToTake = new ArrayList<String>();
/**
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
* Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value.
@ -151,9 +158,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
public boolean ALLOW_MISSING_DATA = false;
private final static String MISSING_DATA = "NA";
private TreeSet<String> samples = new TreeSet<String>();
public void initialize() {
String genotypeHeader = "";
if (!genotypeFieldsToTake.isEmpty()) {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
samples.addAll(vcfSamples);
StringBuilder sb = new StringBuilder();
for (final String sample : samples) {
for (final String gf : genotypeFieldsToTake) {
sb.append(sample+"."+gf+"\t");
}
}
genotypeHeader = sb.toString();
}
// print out the header
out.println(Utils.join("\t", fieldsToTake));
out.println(Utils.join("\t", fieldsToTake) + "\t"+genotypeHeader);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -162,7 +186,8 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
if ( showFiltered || vc.isNotFiltered() ) {
for ( final List<String> record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) )
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples,
ALLOW_MISSING_DATA, splitMultiAllelic) )
out.println(Utils.join("\t", record));
}
}
@ -186,16 +211,25 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
*
* @param vc the VariantContext whose field values we can to capture
* @param fields a non-null list of fields to capture from VC
* @param genotypeFields a (possibly) null) list of fields to capture from each genotype
* @param samples set of samples in vc, can be null in case of sites-only file
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
* @return List of lists of field values
*/
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean splitMultiAllelic) {
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, List<String> genotypeFields,
Set<String> samples, boolean allowMissingData, boolean splitMultiAllelic) {
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
int numFields = fields.size();
final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty());
if (addGenotypeFields)
numFields += genotypeFields.size()*samples.size();
for ( int i = 0; i < numRecordsToProduce; i++ )
records.add(new ArrayList<String>(fields.size()));
records.add(new ArrayList<String>(numFields));
for ( String field : fields ) {
@ -228,6 +262,16 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
}
if (addGenotypeFields) {
for (final String sample : samples) {
for (final String gf : genotypeFields) {
if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf))
addFieldValue(vc.getGenotype(sample).getAttribute(gf),records);
else
addFieldValue(MISSING_DATA, records);
}
}
}
return records;
}
@ -253,7 +297,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, allowMissingData, false);
return extractFields(vc, fields, null, null, allowMissingData, false);
}
//
// default reduce -- doesn't do anything at all

View File

@ -26,10 +26,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -94,10 +97,14 @@ public class ArtificialReadPileupTestProvider {
public GenomeLocParser getGenomeLocParser() { return genomeLocParser; }
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) {
return getAlignmentContextFromAlleles(eventLength, altBases, numReadsPerAllele, false, BASE_QUAL);
}
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele,
boolean addBaseErrors, int phredScaledBaseErrorRate) {
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
ArrayList vcAlleles = new ArrayList<Allele>();
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
Allele refAllele, altAllele;
if (eventLength == 0) {// SNP case
refAllele =Allele.create(refBases.substring(offset,offset+1),true);
@ -128,7 +135,7 @@ public class ArtificialReadPileupTestProvider {
Map<String,AlignmentContext> contexts = new HashMap<String,AlignmentContext>();
for (String sample: sampleNames) {
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample));
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample, addBaseErrors, phredScaledBaseErrorRate));
contexts.put(sample,context);
}
@ -143,7 +150,7 @@ public class ArtificialReadPileupTestProvider {
return rg;
}
private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases,
int[] numReadsPerAllele, String sample) {
int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) {
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
int readStart = contigStart;
int offset = (contigStop-contigStart+1)/2;
@ -158,8 +165,11 @@ public class ArtificialReadPileupTestProvider {
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
if (addErrors)
addBaseErrors(readBases, phredScaledErrorRate);
byte[] readQuals = new byte[readBases.length];
Arrays.fill(readQuals, (byte)BASE_QUAL);
Arrays.fill(readQuals, (byte)phredScaledErrorRate);
GATKSAMRecord read = new GATKSAMRecord(header);
read.setBaseQualities(readQuals);
@ -208,4 +218,20 @@ public class ArtificialReadPileupTestProvider {
}
private void addBaseErrors(final byte[] readBases, final int phredScaledErrorRate) {
double errorProbability = QualityUtils.qualToErrorProb((byte)phredScaledErrorRate);
for (int k=0; k < readBases.length; k++) {
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < errorProbability) {
// random offset
int offset = BaseUtils.simpleBaseToBaseIndex(readBases[k]); //0..3
offset += (GenomeAnalysisEngine.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3
offset %= 4;
readBases[k] = BaseUtils.baseIndexToSimpleBase(offset);
}
}
}
}