Generalizing the SequenomValidationConverter to be able to take in any arbitrary rod type (provided it can be converted to VariantContext).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3155 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-04-12 20:42:18 +00:00
parent 4bb8984f80
commit e413882302
2 changed files with 61 additions and 75 deletions

View File

@ -8,25 +8,19 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
import org.broadinstitute.sting.gatk.refdata.PlinkRod; import org.broadinstitute.sting.gatk.refdata.PlinkRod;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*; import java.util.*;
/** /**
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
*/ */
@Reference(window=@Window(start=0,stop=40)) @Reference(window=@Window(start=0,stop=40))
@Requires(value={},referenceMetaData=@RMD(name="sequenom",type= ReferenceOrderedDatum.class))
public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> { public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false) @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
protected double maxHardy = 20.0; protected double maxHardy = 20.0;
@ -35,9 +29,9 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false) @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
protected double maxHomNonref = 1.1; protected double maxHomNonref = 1.1;
@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+ //@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
"used for smart Hardy-Weinberg annotation",required = false) // "used for smart Hardy-Weinberg annotation",required = false)
public File popFile = null; //private File popFile = null;
// max allowable indel size (based on ref window) // max allowable indel size (based on ref window)
private static final int MAX_INDEL_SIZE = 40; private static final int MAX_INDEL_SIZE = 40;
@ -55,12 +49,12 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
private int numHomVarViolations = 0; private int numHomVarViolations = 0;
private int numTrueVariants = 0; private int numTrueVariants = 0;
private HashMap<String,String> samplesToPopulation; //private HashMap<String,String> samplesToPopulation;
public void initialize() { public void initialize() {
if ( popFile != null ) { //if ( popFile != null ) {
samplesToPopulation = parsePopulationFile(popFile); // samplesToPopulation = parsePopulationFile(popFile);
} //}
} }
public Integer reduceInit() { public Integer reduceInit() {
@ -72,26 +66,51 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
if ( tracker == null ) if ( tracker == null )
return null; return null;
// get the Plink rod at this locus if there is one // get the sequenom rod at this locus if there is one
PlinkRod plinkRod = null; List<Object> rods = tracker.getReferenceMetaData("sequenom");
Iterator<GATKFeature> rods = tracker.getAllRods().iterator(); // ignore places where we don't have a variant
while (rods.hasNext()) { if ( rods.size() == 0 )
Object rod = rods.next().getUnderlyingObject();
if ( rod instanceof PlinkRod ) {
plinkRod = (PlinkRod)rod;
break;
}
}
if ( plinkRod == null )
return null; return null;
if ( sampleNames == null ) Object rod = rods.get(0);
sampleNames = new TreeSet<String>(plinkRod.getSampleNames());
return addVariantInformationToCall(ref, plinkRod); // determine the reference allele
Allele refAllele = determineRefAllele(rod, ref);
VariantContext vc = VariantContextAdaptors.toVariantContext("sequenom", rod, refAllele);
if ( sampleNames == null )
sampleNames = new TreeSet<String>(vc.getSampleNames());
return addVariantInformationToCall(ref, vc, rod);
} }
/**
 * Picks the reference allele to hand to the VariantContext conversion for the given rod.
 *
 * Plink rods get special treatment because they need a very specific determination of the
 * reference allele when they describe an indel; every other rod type (and any Plink rod that
 * is not an indel) uses the single reference base reported by {@code ref.getBase()}.
 *
 * @param rod  the rod object found at this locus; only inspected for being a PlinkRod
 * @param ref  the reference context supplying the base(s) used to build the allele
 * @return the allele, flagged as reference, to use for this rod
 * @throws UnsupportedOperationException if a Plink deletion is longer than MAX_INDEL_SIZE
 */
private Allele determineRefAllele(Object rod, ReferenceContext ref) {
    // anything that is not a Plink indel simply uses the reference base at this locus
    if ( !(rod instanceof PlinkRod) || !((PlinkRod)rod).isIndel() )
        return new Allele(Character.toString(ref.getBase()), true);

    final PlinkRod plinkRod = (PlinkRod)rod;

    // insertions are represented with the Sequenom "no base" marker as the reference allele
    if ( plinkRod.isInsertion() )
        return new Allele(PlinkRod.SEQUENOM_NO_BASE, true);

    // what remains is a deletion: the reference allele is the stretch of deleted bases
    final int deletionLength = plinkRod.getLength();
    if ( deletionLength > MAX_INDEL_SIZE )
        throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);

    // copy deletionLength bases out of the reference window, starting at index 1
    // NOTE(review): index 0 is presumably the base at the current locus, which is skipped -- confirm
    final char[] deletedBases = new char[deletionLength];
    System.arraycopy(ref.getBases(), 1, deletedBases, 0, deletionLength);
    return new Allele(new String(deletedBases), true);
}
public Integer reduce(VCFRecord call, Integer numVariants) { public Integer reduce(VCFRecord call, Integer numVariants) {
if ( call != null ) { if ( call != null ) {
numVariants++; numVariants++;
@ -149,23 +168,8 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
} }
private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) { private VCFRecord addVariantInformationToCall(ReferenceContext ref, VariantContext vContext, Object rod) {
// determine the reference allele
Allele refAllele;
if ( !plinkRod.isIndel() ) {
refAllele = new Allele(Character.toString(ref.getBase()), true);
} else if ( plinkRod.isInsertion() ) {
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
} else {
if ( plinkRod.getLength() > MAX_INDEL_SIZE )
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
char[] deletion = new char[plinkRod.getLength()];
System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength());
refAllele = new Allele(new String(deletion), true);
}
VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele);
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase()); VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
record.setGenotypeFormatString("GT"); record.setGenotypeFormatString("GT");
@ -208,18 +212,19 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount())); infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
record.addInfoFields(infoMap); record.addInfoFields(infoMap);
// add the id // set the id if it's a plink rod
record.setID(plinkRod.getVariantName()); if ( rod instanceof PlinkRod )
record.setID(((PlinkRod)rod).getVariantName());
return record; return record;
} }
private double hardyWeinbergCalculation(VariantContext vc) { private double hardyWeinbergCalculation(VariantContext vc) {
if ( popFile != null ) { //if ( popFile != null ) {
throw new StingException("We still need to implement this!"); // throw new StingException("We still need to implement this!");
} else { //} else {
return VariantContextUtils.computeHardyWeinbergPvalue(vc); return VariantContextUtils.computeHardyWeinbergPvalue(vc);
} //}
} }
// TODO -- REWRITE THIS TO WORK WITH VARIANT CONTEXT // TODO -- REWRITE THIS TO WORK WITH VARIANT CONTEXT
@ -260,23 +265,4 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
} }
*********/ *********/
/**
 * Reads a tab-delimited population file into a sample-to-population lookup.
 *
 * Each line is split on tabs: the first field is taken as the population name and every
 * remaining field on that line is recorded as a sample belonging to that population.
 * If a sample appears more than once, the last occurrence wins.
 *
 * @param file  the population file to read
 * @return a map from sample name to population name (empty if the file has no lines)
 * @throws StingException if the file cannot be opened or read
 */
private HashMap<String,String> parsePopulationFile(File file) {
    HashMap<String,String> samplesToPopulation = new HashMap<String,String>();
    BufferedReader in = null;
    try {
        in = new BufferedReader( new FileReader( file ));
        String line;
        // fetch the next line in the loop condition so the loop advances and stops at end of file
        // (previously the line was read only once, so any non-empty file looped forever)
        while ( (line = in.readLine()) != null ) {
            String[] populationSamples = line.split("\t");
            String population = populationSamples[0];
            for ( int i = 1; i < populationSamples.length; i++ ) {
                samplesToPopulation.put(populationSamples[i], population);
            }
        }
    } catch ( IOException e) {
        throw new StingException("Error reading population file", e);
    } finally {
        // always release the file handle; a failure to close must not mask the real outcome
        if ( in != null ) {
            try {
                in.close();
            } catch ( IOException ignored ) {
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    return samplesToPopulation;
}
} }

View File

@ -9,7 +9,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
@Test @Test
public void testSNPs() { public void testSNPs() {
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt"; String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B input,Plink,"+testPedFile+" -o %s"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("d19f28fdbe3e731522a52c5329777a9f")); Arrays.asList("d19f28fdbe3e731522a52c5329777a9f"));
executeTest("Test SNPs", spec); executeTest("Test SNPs", spec);
@ -18,7 +18,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
@Test @Test
public void testIndels() { public void testIndels() {
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped"; String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B input,Plink,"+testPedFile+" -o %s"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("257fcd5e345f2853813e37b88fbc707c")); Arrays.asList("257fcd5e345f2853813e37b88fbc707c"));
executeTest("Test Indels", spec); executeTest("Test Indels", spec);