Updated the CallsetConcordance classes to use new VCF Variation code... and uncovered a whole bunch of VCF bugs in the process. I'm not convinced that I got them all, so I'll unit test like crazy when the refactoring is done.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2272 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-06 11:43:40 +00:00
parent b6f8e33f4c
commit c0528cd88e
16 changed files with 142 additions and 69 deletions

View File

@ -259,6 +259,16 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
return mCurrentRecord.getGenotypes();
}
/**
* get the genotypes
*
* @return a list of the genotypes
*/
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
assertNotNull();
return mCurrentRecord.getVCFGenotypeRecords();
}
/**
* do we have the specified genotype? not all backedByGenotypes
* have all the genotype data.

View File

@ -130,19 +130,15 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
// pull out all of the individual calls from the rods and insert into a map based on the
// mapping from rod/sample to uniquified name
HashMap<String, VCFGenotypeCall> samplesToRecords = new HashMap<String, VCFGenotypeCall>();
HashMap<String, Genotype> samplesToRecords = new HashMap<String, Genotype>();
for ( RodVCF rod : vcfRods ) {
List<Genotype> records = rod.getGenotypes();
for ( Genotype g : records ) {
if ( !(g instanceof VCFGenotypeCall) )
throw new StingException("Expected VCF rod Genotypes to be of type VCFGenotypeCall");
VCFGenotypeCall vcfCall = (VCFGenotypeCall)g;
String uniquifiedSample = rodNamesToSampleNames.get(new Pair<String, String>(rod.getName(), vcfCall.getSampleName()));
List<VCFGenotypeRecord> records = rod.getVCFGenotypeRecords();
for ( VCFGenotypeRecord vcfRec : records ) {
String uniquifiedSample = rodNamesToSampleNames.get(new Pair<String, String>(rod.getName(), vcfRec.getSampleName()));
if ( uniquifiedSample == null )
throw new StingException("Unexpected sample encountered: " + vcfCall.getSampleName() + " in rod " + rod.getName());
throw new StingException("Unexpected sample encountered: " + vcfRec.getSampleName() + " in rod " + rod.getName());
samplesToRecords.put(uniquifiedSample, vcfCall);
samplesToRecords.put(uniquifiedSample, vcfRec);
}
}
@ -173,4 +169,4 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
vcfWriter.close();
out.printf("Processed %d loci.\n", result);
}
}
}

View File

@ -1,7 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.util.Map;
import java.util.Set;
@ -9,6 +9,6 @@ import java.util.Set;
public interface ConcordanceType {
public void initialize(Map<String,String> args, Set<String> samples);
public String computeConcordance(Map<String, VCFGenotypeCall> samplesToRecords, ReferenceContext ref);
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref);
public String getInfoName();
}

View File

@ -4,7 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.util.*;
@ -53,10 +53,10 @@ public class IndelSubsets implements ConcordanceType {
}
}
public String computeConcordance(Map<String, VCFGenotypeCall> samplesToRecords, ReferenceContext ref) {
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
VCFGenotypeCall indel1 = samplesToRecords.get(sample1);
VCFGenotypeCall indel2 = samplesToRecords.get(sample2);
Genotype indel1 = samplesToRecords.get(sample1);
Genotype indel2 = samplesToRecords.get(sample2);
int set1 = ( indel1 != null && !indel1.isPointGenotype() ? 0 : 1 );
int set2 = ( indel2 != null && !indel2.isPointGenotype() ? 0 : 1 );

View File

@ -1,7 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.util.*;
import java.util.Map.Entry;
@ -18,12 +18,12 @@ public class NWayVenn implements ConcordanceType {
public void initialize(Map<String, String> args, Set<String> samples) { }
public String computeConcordance(Map<String, VCFGenotypeCall> samplesToRecords, ReferenceContext ref) {
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
if ( samplesToRecords.size() == 0 )
return null;
TreeSet<String> concordantSamples = new TreeSet<String>();
for ( Entry<String, VCFGenotypeCall> entry : samplesToRecords.entrySet() ) {
for ( Entry<String, Genotype> entry : samplesToRecords.entrySet() ) {
concordantSamples.add(entry.getKey());
}
@ -39,4 +39,4 @@ public class NWayVenn implements ConcordanceType {
}
public String getInfoName() { return "NwayVenn"; }
}
}

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.util.*;
@ -29,10 +29,10 @@ public class SNPGenotypeConcordance implements ConcordanceType {
sample2 = iter.next();
}
public String computeConcordance(Map<String, VCFGenotypeCall> samplesToRecords, ReferenceContext ref) {
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
VCFGenotypeCall call1 = samplesToRecords.get(sample1);
VCFGenotypeCall call2 = samplesToRecords.get(sample2);
Genotype call1 = samplesToRecords.get(sample1);
Genotype call2 = samplesToRecords.get(sample2);
// the only reason they would be null is a lack of coverage
if ( call1 == null || call2 == null ) {
@ -78,9 +78,9 @@ public class SNPGenotypeConcordance implements ConcordanceType {
}
// one is variant and the other is ref
else if ( call1.isPointGenotype() && call2.isVariant() && confidence1 >= Qscore )
else if ( call1.isPointGenotype() && call2.isVariant(ref.getBase()) && confidence1 >= Qscore )
return "set1VariantSet2Ref";
else if ( call2.isPointGenotype() && call1.isVariant() && confidence2 >= Qscore )
else if ( call2.isPointGenotype() && call1.isVariant(ref.getBase()) && confidence2 >= Qscore )
return "set1RefSet2Variant";
return null;

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
@ -25,10 +25,10 @@ public class SimpleVenn implements ConcordanceType {
sample2 = iter.next();
}
public String computeConcordance(Map<String, VCFGenotypeCall> samplesToRecords, ReferenceContext ref) {
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
VCFGenotypeCall call1 = samplesToRecords.get(sample1);
VCFGenotypeCall call2 = samplesToRecords.get(sample2);
Genotype call1 = samplesToRecords.get(sample1);
Genotype call2 = samplesToRecords.get(sample2);
if ( call1 == null && call2 == null )
return null;

View File

@ -25,8 +25,8 @@ public class VCFGenotypeEncoding {
public VCFGenotypeEncoding(String baseString) {
if ((baseString.length() == 1)) {
// are we an empty (no-call) genotype?
if (baseString.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) {
mBases = VCFGenotypeRecord.EMPTY_GENOTYPE;
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
mBases = VCFGenotypeRecord.EMPTY_ALLELE;
mLength = 0;
mType = TYPE.UNCALLED;
} else if (!validBases(baseString)) {
@ -67,10 +67,14 @@ public class VCFGenotypeEncoding {
}
public boolean equals(Object obj) {
if (obj != null && (obj.getClass().equals(this.getClass()))) {
if ( obj == null )
return false;
if ( obj instanceof VCFGenotypeEncoding ) {
VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj;
return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength);
}
if ( mType == TYPE.UNCALLED && obj.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
return true;
return false;
}
@ -84,7 +88,7 @@ public class VCFGenotypeEncoding {
/**
* dump the string representation of this genotype encoding
*
* @return
* @return string representation
*/
public String toString() {
StringBuilder builder = new StringBuilder();

View File

@ -16,8 +16,11 @@ import java.util.Map;
* <p/>
*/
public class VCFGenotypeRecord implements Genotype {
// the symbol for a empty genotype
public static final String EMPTY_GENOTYPE = ".";
// the symbols for an empty genotype
public static final String EMPTY_GENOTYPE = "./.";
public static final String EMPTY_ALLELE = ".";
public static final int MISSING_DEPTH = -1;
// what kind of phasing this genotype has
public enum PHASE {
@ -37,7 +40,7 @@ public class VCFGenotypeRecord implements Genotype {
private final Map<String, String> mFields = new HashMap<String, String>();
// our sample name
private final String mSampleName;
private String mSampleName;
/**
* Create a VCF genotype record
@ -58,6 +61,10 @@ public class VCFGenotypeRecord implements Genotype {
this.mRecord = record;
}
public void setSampleName(String name) {
mSampleName = name;
}
/**
* determine the phase of the genotype
*
@ -98,6 +105,15 @@ public class VCFGenotypeRecord implements Genotype {
return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0);
}
public int getReadCount() {
int depth = MISSING_DEPTH;
if ( mFields.containsKey("RD") )
depth = Integer.valueOf(mFields.get("RD"));
else if ( mFields.containsKey("DP") )
depth = Integer.valueOf(mFields.get("DP"));
return depth;
}
public GenomeLoc getLocation() {
return mRecord != null ? mRecord.getLocation() : null;
}
@ -157,7 +173,7 @@ public class VCFGenotypeRecord implements Genotype {
boolean first = true;
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
str += VCFGenotypeRecord.EMPTY_GENOTYPE;
str += VCFGenotypeRecord.EMPTY_ALLELE;
else
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {

View File

@ -213,8 +213,8 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
VCFGenotypeRecord record = new VCFGenotypeRecord(sampleName,
alleles,

View File

@ -65,7 +65,9 @@ class VCFParameters {
}
public void addAlternateBase(VCFGenotypeEncoding base) {
if (!alternateBases.contains(base) && !base.toString().equals(String.valueOf(this.getReferenceBase())))
if ( !alternateBases.contains(base) &&
!base.toString().equals(String.valueOf(getReferenceBase())) &&
!base.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
alternateBases.add(base);
}

View File

@ -206,15 +206,20 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
// if we have genotyping data, we try and extract the genotype fields
if (mHeader.hasGenotypingData()) {
String mFormatString = tokens[index];
String keyStrings[] = mFormatString.split(":");
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
index++;
String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
for (String str : mHeader.getGenotypeSamples()) {
if (!tokens[index].equalsIgnoreCase(VCFGenotypeRecord.EMPTY_GENOTYPE))
genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
index++;
}
return new VCFRecord(values, mFormatString, genotypeRecords);
VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords);
// associate the genotypes with this new record
for ( VCFGenotypeRecord gr : genotypeRecords )
gr.setVCFRecord(vrec);
return vrec;
}
return new VCFRecord(values);
}
@ -231,11 +236,25 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase);
}
/**
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
*
* @param sampleName the sample name
* @param keyStrings the split format string for this record, which contains the keys for the genotype parameters
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
* @param referenceBase the reference base
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
// parameters to create the VCF genotype record
Map<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
String keyStrings[] = formatString.split(":");
for (String key : keyStrings) {
String parse;
@ -260,11 +279,13 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
}
if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) )
tagToValue.clear();
// catch some common errors, either there are too many field keys or there are two many field values
if (keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
throw new RuntimeException("VCFReader: genotype value count doesn't match the key count (expected "
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
else if (genotypeString.length() > 0)
else if ( genotypeString.length() > 0 )
throw new RuntimeException("VCFReader: genotype string contained additional unprocessed fields: " + genotypeString
+ ". This most likely means that the format string is shorter then the value fields.");
return new VCFGenotypeRecord(sampleName, bases, phase, tagToValue);
@ -280,8 +301,8 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @param bases the list of bases for this genotype call
*/
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) {
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
} else {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds

View File

@ -85,11 +85,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
this.setQual(qual);
this.setFilterString(filters);
this.mInfoFields.putAll(infoFields);
mGenotypeFormatString = genotypeFormatString;
// associate the genotypes with this Variation, then add them
for ( VCFGenotypeRecord rec : genotypeObjects )
rec.setVCFRecord(this);
this.mGenotypeFormatString = genotypeFormatString;
this.mGenotypeFields.addAll(genotypeObjects);
}

View File

@ -5,8 +5,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
import java.util.Map.Entry;
@ -116,17 +115,13 @@ public class VCFUtils {
int freqsSeen = 0;
for ( RodVCF rod : rods ) {
List<Genotype> myGenotypes = rod.getGenotypes();
for ( Genotype g : myGenotypes ) {
if ( !(g instanceof VCFGenotypeCall) )
throw new StingException("Expected VCFGenotypeCall object but instead saw " + g.getClass().getSimpleName());
List<VCFGenotypeRecord> myGenotypes = rod.getVCFGenotypeRecords();
for ( VCFGenotypeRecord call : myGenotypes ) {
// set the name to be the new uniquified name and add it to the list of genotypes
VCFGenotypeCall call = (VCFGenotypeCall)g;
call.setSampleName(rodNamesToSampleNames.get(new Pair<String, String>(rod.getName(), call.getSampleName())));
if ( params.getPosition() < 1 )
params.setLocations(call.getLocation(), call.getReference());
params.addGenotypeRecord(createVCFGenotypeRecord(params, call));
params.setLocations(rod.getLocation(), call.getReference());
params.addGenotypeRecord(createVCFGenotypeRecord(params, call, rod.mCurrentRecord));
totalReadDepth += call.getReadCount();
}
@ -168,6 +163,39 @@ public class VCFUtils {
params.getGenotypesRecords());
}
/**
* create the VCF genotype record
*
* @param params the VCF parameters object
* @param gtype the genotype
* @param vcfrecord the VCF record
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeRecord gtype, VCFRecord vcfrecord) {
Map<String, String> map = new HashMap<String, String>();
// calculate the RMS mapping qualities and the read depth
int readDepth = gtype.getReadCount();
map.put("RD", String.valueOf(readDepth));
params.addFormatItem("RD");
double qual = 10.0 * gtype.getNegLog10PError();
map.put("GQ", String.format("%.2f", qual));
params.addFormatItem("GQ");
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
for (VCFGenotypeEncoding allele : alleles) {
params.addAlternateBase(allele);
}
VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(),
alleles,
VCFGenotypeRecord.PHASE.UNPHASED,
map);
record.setVCFRecord(vcfrecord);
return record;
}
/**
* create the VCF genotype record
*

View File

@ -61,7 +61,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
+ " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -A SecondBaseSkew"
+ " -sample variant -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pileup_test_chr15.vcf"
+ " -vcf %s -L chr15:46347148";
String expected_md5 = "fe7f6d2b48ecf2c1340b5de98a9d5614";
String expected_md5 = "160a8e3826eb745bcfe2f463f73e1ec7";
WalkerTestSpec spec = new WalkerTestSpec(test_args,1,Arrays.asList(expected_md5));
executeTest("Testing on locus with many indels", spec);
}

View File

@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSimpleVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
Arrays.asList("2fc12b6f02f4cb589f2fd134e765d6b7"));
Arrays.asList("851b68004874f3a2e76d795e7401f8a0"));
executeTest("testSimpleVenn", spec);
}
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSNPConcordance() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("142bcfcc6bb404cd4bd1a4624fa9a15e"));
Arrays.asList("7afb56b30257fe2d66bee7a029d75685"));
executeTest("testSNPConcordance", spec);
}
@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testNWayVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
Arrays.asList("9a5910137b6b9745f6e0c3ee711a6bfa"));
Arrays.asList("f452c04c600ad10c054f18b0c77b53d5"));
executeTest("testNWayVenn", spec);
}
}