a) Fully functional and working multiallelic exact model for pools. Needs cleanup/more testing. b) Better unit test for pool genotype likelihoods - it now optionally generates actual noisy pileups that can be used for assessing GL accuracy, c) Totally experimental, hidden option in VariantsToTable to output genotype fields. Specifying -GF will output columns of form Sample.FieldName - needs also more testing
This commit is contained in:
parent
67e5c3ff9f
commit
ae26f0fe14
|
|
@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -114,6 +117,10 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
||||
public List<String> fieldsToTake = new ArrayList<String>();
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each field to capture for output in the table", required=true)
|
||||
public List<String> genotypeFieldsToTake = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||
* Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value.
|
||||
|
|
@ -151,9 +158,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
public boolean ALLOW_MISSING_DATA = false;
|
||||
private final static String MISSING_DATA = "NA";
|
||||
|
||||
private TreeSet<String> samples = new TreeSet<String>();
|
||||
|
||||
public void initialize() {
|
||||
|
||||
String genotypeHeader = "";
|
||||
if (!genotypeFieldsToTake.isEmpty()) {
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
|
||||
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||
samples.addAll(vcfSamples);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (final String sample : samples) {
|
||||
for (final String gf : genotypeFieldsToTake) {
|
||||
sb.append(sample+"."+gf+"\t");
|
||||
}
|
||||
}
|
||||
genotypeHeader = sb.toString();
|
||||
}
|
||||
// print out the header
|
||||
out.println(Utils.join("\t", fieldsToTake));
|
||||
out.println(Utils.join("\t", fieldsToTake) + "\t"+genotypeHeader);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -162,7 +186,8 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
|
||||
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
|
||||
if ( showFiltered || vc.isNotFiltered() ) {
|
||||
for ( final List<String> record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) )
|
||||
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples,
|
||||
ALLOW_MISSING_DATA, splitMultiAllelic) )
|
||||
out.println(Utils.join("\t", record));
|
||||
}
|
||||
}
|
||||
|
|
@ -186,16 +211,25 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
*
|
||||
* @param vc the VariantContext whose field values we can to capture
|
||||
* @param fields a non-null list of fields to capture from VC
|
||||
* @param genotypeFields a (possibly) null) list of fields to capture from each genotype
|
||||
* @param samples set of samples in vc, can be null in case of sites-only file
|
||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
|
||||
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
|
||||
* @return List of lists of field values
|
||||
*/
|
||||
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean splitMultiAllelic) {
|
||||
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, List<String> genotypeFields,
|
||||
Set<String> samples, boolean allowMissingData, boolean splitMultiAllelic) {
|
||||
|
||||
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
|
||||
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
|
||||
|
||||
int numFields = fields.size();
|
||||
final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty());
|
||||
if (addGenotypeFields)
|
||||
numFields += genotypeFields.size()*samples.size();
|
||||
|
||||
for ( int i = 0; i < numRecordsToProduce; i++ )
|
||||
records.add(new ArrayList<String>(fields.size()));
|
||||
records.add(new ArrayList<String>(numFields));
|
||||
|
||||
for ( String field : fields ) {
|
||||
|
||||
|
|
@ -228,6 +262,16 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
if (addGenotypeFields) {
|
||||
for (final String sample : samples) {
|
||||
for (final String gf : genotypeFields) {
|
||||
if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf))
|
||||
addFieldValue(vc.getGenotype(sample).getAttribute(gf),records);
|
||||
else
|
||||
addFieldValue(MISSING_DATA, records);
|
||||
}
|
||||
}
|
||||
}
|
||||
return records;
|
||||
}
|
||||
|
||||
|
|
@ -253,7 +297,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
return extractFields(vc, fields, allowMissingData, false);
|
||||
return extractFields(vc, fields, null, null, allowMissingData, false);
|
||||
}
|
||||
//
|
||||
// default reduce -- doesn't do anything at all
|
||||
|
|
|
|||
|
|
@ -26,10 +26,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -94,10 +97,14 @@ public class ArtificialReadPileupTestProvider {
|
|||
public GenomeLocParser getGenomeLocParser() { return genomeLocParser; }
|
||||
|
||||
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) {
|
||||
return getAlignmentContextFromAlleles(eventLength, altBases, numReadsPerAllele, false, BASE_QUAL);
|
||||
}
|
||||
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele,
|
||||
boolean addBaseErrors, int phredScaledBaseErrorRate) {
|
||||
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
|
||||
|
||||
|
||||
ArrayList vcAlleles = new ArrayList<Allele>();
|
||||
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
|
||||
Allele refAllele, altAllele;
|
||||
if (eventLength == 0) {// SNP case
|
||||
refAllele =Allele.create(refBases.substring(offset,offset+1),true);
|
||||
|
|
@ -128,7 +135,7 @@ public class ArtificialReadPileupTestProvider {
|
|||
Map<String,AlignmentContext> contexts = new HashMap<String,AlignmentContext>();
|
||||
|
||||
for (String sample: sampleNames) {
|
||||
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample));
|
||||
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample, addBaseErrors, phredScaledBaseErrorRate));
|
||||
contexts.put(sample,context);
|
||||
|
||||
}
|
||||
|
|
@ -143,7 +150,7 @@ public class ArtificialReadPileupTestProvider {
|
|||
return rg;
|
||||
}
|
||||
private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases,
|
||||
int[] numReadsPerAllele, String sample) {
|
||||
int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) {
|
||||
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
|
||||
int readStart = contigStart;
|
||||
int offset = (contigStop-contigStart+1)/2;
|
||||
|
|
@ -158,8 +165,11 @@ public class ArtificialReadPileupTestProvider {
|
|||
|
||||
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
|
||||
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
|
||||
if (addErrors)
|
||||
addBaseErrors(readBases, phredScaledErrorRate);
|
||||
|
||||
byte[] readQuals = new byte[readBases.length];
|
||||
Arrays.fill(readQuals, (byte)BASE_QUAL);
|
||||
Arrays.fill(readQuals, (byte)phredScaledErrorRate);
|
||||
|
||||
GATKSAMRecord read = new GATKSAMRecord(header);
|
||||
read.setBaseQualities(readQuals);
|
||||
|
|
@ -208,4 +218,20 @@ public class ArtificialReadPileupTestProvider {
|
|||
|
||||
}
|
||||
|
||||
private void addBaseErrors(final byte[] readBases, final int phredScaledErrorRate) {
|
||||
double errorProbability = QualityUtils.qualToErrorProb((byte)phredScaledErrorRate);
|
||||
|
||||
for (int k=0; k < readBases.length; k++) {
|
||||
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < errorProbability) {
|
||||
// random offset
|
||||
int offset = BaseUtils.simpleBaseToBaseIndex(readBases[k]); //0..3
|
||||
offset += (GenomeAnalysisEngine.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3
|
||||
offset %= 4;
|
||||
readBases[k] = BaseUtils.baseIndexToSimpleBase(offset);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue