a) Fully functional and working multiallelic exact model for pools. Needs cleanup/more testing. b) Better unit test for pool genotype likelihoods - it now optionally generates actual noisy pileups that can be used for assessing GL accuracy, c) Totally experimental, hidden option in VariantsToTable to output genotype fields. Specifying -GF will output columns of form Sample.FieldName - needs also more testing
This commit is contained in:
parent
67e5c3ff9f
commit
ae26f0fe14
|
|
@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.*;
|
import org.broadinstitute.sting.commandline.*;
|
||||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
|
@ -114,6 +117,10 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
||||||
public List<String> fieldsToTake = new ArrayList<String>();
|
public List<String> fieldsToTake = new ArrayList<String>();
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each field to capture for output in the table", required=true)
|
||||||
|
public List<String> genotypeFieldsToTake = new ArrayList<String>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||||
* Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value.
|
* Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value.
|
||||||
|
|
@ -151,9 +158,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
public boolean ALLOW_MISSING_DATA = false;
|
public boolean ALLOW_MISSING_DATA = false;
|
||||||
private final static String MISSING_DATA = "NA";
|
private final static String MISSING_DATA = "NA";
|
||||||
|
|
||||||
|
private TreeSet<String> samples = new TreeSet<String>();
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
|
String genotypeHeader = "";
|
||||||
|
if (!genotypeFieldsToTake.isEmpty()) {
|
||||||
|
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
|
||||||
|
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||||
|
samples.addAll(vcfSamples);
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (final String sample : samples) {
|
||||||
|
for (final String gf : genotypeFieldsToTake) {
|
||||||
|
sb.append(sample+"."+gf+"\t");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
genotypeHeader = sb.toString();
|
||||||
|
}
|
||||||
// print out the header
|
// print out the header
|
||||||
out.println(Utils.join("\t", fieldsToTake));
|
out.println(Utils.join("\t", fieldsToTake) + "\t"+genotypeHeader);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
@ -162,7 +186,8 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
|
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
|
||||||
if ( showFiltered || vc.isNotFiltered() ) {
|
if ( showFiltered || vc.isNotFiltered() ) {
|
||||||
for ( final List<String> record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) )
|
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples,
|
||||||
|
ALLOW_MISSING_DATA, splitMultiAllelic) )
|
||||||
out.println(Utils.join("\t", record));
|
out.println(Utils.join("\t", record));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -186,16 +211,25 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
*
|
*
|
||||||
* @param vc the VariantContext whose field values we can to capture
|
* @param vc the VariantContext whose field values we can to capture
|
||||||
* @param fields a non-null list of fields to capture from VC
|
* @param fields a non-null list of fields to capture from VC
|
||||||
|
* @param genotypeFields a (possibly) null) list of fields to capture from each genotype
|
||||||
|
* @param samples set of samples in vc, can be null in case of sites-only file
|
||||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
|
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
|
||||||
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
|
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
|
||||||
* @return List of lists of field values
|
* @return List of lists of field values
|
||||||
*/
|
*/
|
||||||
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean splitMultiAllelic) {
|
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, List<String> genotypeFields,
|
||||||
|
Set<String> samples, boolean allowMissingData, boolean splitMultiAllelic) {
|
||||||
|
|
||||||
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
|
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
|
||||||
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
|
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
|
||||||
|
|
||||||
|
int numFields = fields.size();
|
||||||
|
final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty());
|
||||||
|
if (addGenotypeFields)
|
||||||
|
numFields += genotypeFields.size()*samples.size();
|
||||||
|
|
||||||
for ( int i = 0; i < numRecordsToProduce; i++ )
|
for ( int i = 0; i < numRecordsToProduce; i++ )
|
||||||
records.add(new ArrayList<String>(fields.size()));
|
records.add(new ArrayList<String>(numFields));
|
||||||
|
|
||||||
for ( String field : fields ) {
|
for ( String field : fields ) {
|
||||||
|
|
||||||
|
|
@ -228,6 +262,16 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (addGenotypeFields) {
|
||||||
|
for (final String sample : samples) {
|
||||||
|
for (final String gf : genotypeFields) {
|
||||||
|
if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf))
|
||||||
|
addFieldValue(vc.getGenotype(sample).getAttribute(gf),records);
|
||||||
|
else
|
||||||
|
addFieldValue(MISSING_DATA, records);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return records;
|
return records;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -253,7 +297,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||||
return extractFields(vc, fields, allowMissingData, false);
|
return extractFields(vc, fields, null, null, allowMissingData, false);
|
||||||
}
|
}
|
||||||
//
|
//
|
||||||
// default reduce -- doesn't do anything at all
|
// default reduce -- doesn't do anything at all
|
||||||
|
|
|
||||||
|
|
@ -26,10 +26,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||||
|
|
@ -94,10 +97,14 @@ public class ArtificialReadPileupTestProvider {
|
||||||
public GenomeLocParser getGenomeLocParser() { return genomeLocParser; }
|
public GenomeLocParser getGenomeLocParser() { return genomeLocParser; }
|
||||||
|
|
||||||
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) {
|
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) {
|
||||||
|
return getAlignmentContextFromAlleles(eventLength, altBases, numReadsPerAllele, false, BASE_QUAL);
|
||||||
|
}
|
||||||
|
public Map<String,AlignmentContext> getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele,
|
||||||
|
boolean addBaseErrors, int phredScaledBaseErrorRate) {
|
||||||
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
|
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
|
||||||
|
|
||||||
|
|
||||||
ArrayList vcAlleles = new ArrayList<Allele>();
|
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
|
||||||
Allele refAllele, altAllele;
|
Allele refAllele, altAllele;
|
||||||
if (eventLength == 0) {// SNP case
|
if (eventLength == 0) {// SNP case
|
||||||
refAllele =Allele.create(refBases.substring(offset,offset+1),true);
|
refAllele =Allele.create(refBases.substring(offset,offset+1),true);
|
||||||
|
|
@ -128,7 +135,7 @@ public class ArtificialReadPileupTestProvider {
|
||||||
Map<String,AlignmentContext> contexts = new HashMap<String,AlignmentContext>();
|
Map<String,AlignmentContext> contexts = new HashMap<String,AlignmentContext>();
|
||||||
|
|
||||||
for (String sample: sampleNames) {
|
for (String sample: sampleNames) {
|
||||||
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample));
|
AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample, addBaseErrors, phredScaledBaseErrorRate));
|
||||||
contexts.put(sample,context);
|
contexts.put(sample,context);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -143,7 +150,7 @@ public class ArtificialReadPileupTestProvider {
|
||||||
return rg;
|
return rg;
|
||||||
}
|
}
|
||||||
private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases,
|
private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases,
|
||||||
int[] numReadsPerAllele, String sample) {
|
int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) {
|
||||||
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
|
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
|
||||||
int readStart = contigStart;
|
int readStart = contigStart;
|
||||||
int offset = (contigStop-contigStart+1)/2;
|
int offset = (contigStop-contigStart+1)/2;
|
||||||
|
|
@ -158,8 +165,11 @@ public class ArtificialReadPileupTestProvider {
|
||||||
|
|
||||||
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
|
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
|
||||||
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
|
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
|
||||||
|
if (addErrors)
|
||||||
|
addBaseErrors(readBases, phredScaledErrorRate);
|
||||||
|
|
||||||
byte[] readQuals = new byte[readBases.length];
|
byte[] readQuals = new byte[readBases.length];
|
||||||
Arrays.fill(readQuals, (byte)BASE_QUAL);
|
Arrays.fill(readQuals, (byte)phredScaledErrorRate);
|
||||||
|
|
||||||
GATKSAMRecord read = new GATKSAMRecord(header);
|
GATKSAMRecord read = new GATKSAMRecord(header);
|
||||||
read.setBaseQualities(readQuals);
|
read.setBaseQualities(readQuals);
|
||||||
|
|
@ -208,4 +218,20 @@ public class ArtificialReadPileupTestProvider {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addBaseErrors(final byte[] readBases, final int phredScaledErrorRate) {
|
||||||
|
double errorProbability = QualityUtils.qualToErrorProb((byte)phredScaledErrorRate);
|
||||||
|
|
||||||
|
for (int k=0; k < readBases.length; k++) {
|
||||||
|
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < errorProbability) {
|
||||||
|
// random offset
|
||||||
|
int offset = BaseUtils.simpleBaseToBaseIndex(readBases[k]); //0..3
|
||||||
|
offset += (GenomeAnalysisEngine.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3
|
||||||
|
offset %= 4;
|
||||||
|
readBases[k] = BaseUtils.baseIndexToSimpleBase(offset);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue