Generalizing the SequenomValidationConverter so that it can take in any rod type (provided the rod can be converted to a VariantContext).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3155 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4bb8984f80
commit
e413882302
|
|
@ -8,25 +8,19 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
|
||||||
import org.broadinstitute.sting.gatk.refdata.PlinkRod;
|
import org.broadinstitute.sting.gatk.refdata.PlinkRod;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
|
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
|
||||||
*/
|
*/
|
||||||
@Reference(window=@Window(start=0,stop=40))
|
@Reference(window=@Window(start=0,stop=40))
|
||||||
|
@Requires(value={},referenceMetaData=@RMD(name="sequenom",type= ReferenceOrderedDatum.class))
|
||||||
public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
|
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
|
||||||
protected double maxHardy = 20.0;
|
protected double maxHardy = 20.0;
|
||||||
|
|
@ -35,9 +29,9 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
|
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
|
||||||
protected double maxHomNonref = 1.1;
|
protected double maxHomNonref = 1.1;
|
||||||
|
|
||||||
@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
|
//@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
|
||||||
"used for smart Hardy-Weinberg annotation",required = false)
|
// "used for smart Hardy-Weinberg annotation",required = false)
|
||||||
public File popFile = null;
|
//private File popFile = null;
|
||||||
|
|
||||||
// max allowable indel size (based on ref window)
|
// max allowable indel size (based on ref window)
|
||||||
private static final int MAX_INDEL_SIZE = 40;
|
private static final int MAX_INDEL_SIZE = 40;
|
||||||
|
|
@ -55,12 +49,12 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
private int numHomVarViolations = 0;
|
private int numHomVarViolations = 0;
|
||||||
private int numTrueVariants = 0;
|
private int numTrueVariants = 0;
|
||||||
|
|
||||||
private HashMap<String,String> samplesToPopulation;
|
//private HashMap<String,String> samplesToPopulation;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if ( popFile != null ) {
|
//if ( popFile != null ) {
|
||||||
samplesToPopulation = parsePopulationFile(popFile);
|
// samplesToPopulation = parsePopulationFile(popFile);
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -72,26 +66,51 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
if ( tracker == null )
|
if ( tracker == null )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
// get the Plink rod at this locus if there is one
|
// get the sequenom rod at this locus if there is one
|
||||||
PlinkRod plinkRod = null;
|
List<Object> rods = tracker.getReferenceMetaData("sequenom");
|
||||||
Iterator<GATKFeature> rods = tracker.getAllRods().iterator();
|
// ignore places where we don't have a variant
|
||||||
while (rods.hasNext()) {
|
if ( rods.size() == 0 )
|
||||||
Object rod = rods.next().getUnderlyingObject();
|
|
||||||
if ( rod instanceof PlinkRod ) {
|
|
||||||
plinkRod = (PlinkRod)rod;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( plinkRod == null )
|
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
if ( sampleNames == null )
|
Object rod = rods.get(0);
|
||||||
sampleNames = new TreeSet<String>(plinkRod.getSampleNames());
|
|
||||||
|
|
||||||
return addVariantInformationToCall(ref, plinkRod);
|
// determine the reference allele
|
||||||
|
Allele refAllele = determineRefAllele(rod, ref);
|
||||||
|
|
||||||
|
VariantContext vc = VariantContextAdaptors.toVariantContext("sequenom", rod, refAllele);
|
||||||
|
|
||||||
|
if ( sampleNames == null )
|
||||||
|
sampleNames = new TreeSet<String>(vc.getSampleNames());
|
||||||
|
|
||||||
|
return addVariantInformationToCall(ref, vc, rod);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Allele determineRefAllele(Object rod, ReferenceContext ref) {
|
||||||
|
Allele refAllele;
|
||||||
|
|
||||||
|
// ugly hack to get around the fact that the Plink rod needs
|
||||||
|
// a very specific determination of the reference allele
|
||||||
|
if ( rod instanceof PlinkRod ) {
|
||||||
|
PlinkRod plink = (PlinkRod)rod;
|
||||||
|
if ( !plink.isIndel() ) {
|
||||||
|
refAllele = new Allele(Character.toString(ref.getBase()), true);
|
||||||
|
} else if ( plink.isInsertion() ) {
|
||||||
|
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
|
||||||
|
} else {
|
||||||
|
if ( plink.getLength() > MAX_INDEL_SIZE )
|
||||||
|
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
|
||||||
|
char[] deletion = new char[plink.getLength()];
|
||||||
|
System.arraycopy(ref.getBases(), 1, deletion, 0, plink.getLength());
|
||||||
|
refAllele = new Allele(new String(deletion), true);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
refAllele = new Allele(Character.toString(ref.getBase()), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
return refAllele;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public Integer reduce(VCFRecord call, Integer numVariants) {
|
public Integer reduce(VCFRecord call, Integer numVariants) {
|
||||||
if ( call != null ) {
|
if ( call != null ) {
|
||||||
numVariants++;
|
numVariants++;
|
||||||
|
|
@ -149,23 +168,8 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) {
|
private VCFRecord addVariantInformationToCall(ReferenceContext ref, VariantContext vContext, Object rod) {
|
||||||
|
|
||||||
// determine the reference allele
|
|
||||||
Allele refAllele;
|
|
||||||
if ( !plinkRod.isIndel() ) {
|
|
||||||
refAllele = new Allele(Character.toString(ref.getBase()), true);
|
|
||||||
} else if ( plinkRod.isInsertion() ) {
|
|
||||||
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
|
|
||||||
} else {
|
|
||||||
if ( plinkRod.getLength() > MAX_INDEL_SIZE )
|
|
||||||
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
|
|
||||||
char[] deletion = new char[plinkRod.getLength()];
|
|
||||||
System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength());
|
|
||||||
refAllele = new Allele(new String(deletion), true);
|
|
||||||
}
|
|
||||||
|
|
||||||
VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele);
|
|
||||||
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
|
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
|
||||||
record.setGenotypeFormatString("GT");
|
record.setGenotypeFormatString("GT");
|
||||||
|
|
||||||
|
|
@ -208,18 +212,19 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
|
infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
|
||||||
record.addInfoFields(infoMap);
|
record.addInfoFields(infoMap);
|
||||||
|
|
||||||
// add the id
|
// set the id if it's a plink rod
|
||||||
record.setID(plinkRod.getVariantName());
|
if ( rod instanceof PlinkRod )
|
||||||
|
record.setID(((PlinkRod)rod).getVariantName());
|
||||||
|
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double hardyWeinbergCalculation(VariantContext vc) {
|
private double hardyWeinbergCalculation(VariantContext vc) {
|
||||||
if ( popFile != null ) {
|
//if ( popFile != null ) {
|
||||||
throw new StingException("We still need to implement this!");
|
// throw new StingException("We still need to implement this!");
|
||||||
} else {
|
//} else {
|
||||||
return VariantContextUtils.computeHardyWeinbergPvalue(vc);
|
return VariantContextUtils.computeHardyWeinbergPvalue(vc);
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO -- REWRITE THIS TO WORK WITH VARIANT CONTEXT
|
// TODO -- REWRITE THIS TO WORK WITH VARIANT CONTEXT
|
||||||
|
|
@ -260,23 +265,4 @@ public class SequenomValidationConverter extends RodWalker<VCFRecord,Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
*********/
|
*********/
|
||||||
|
|
||||||
private HashMap<String,String> parsePopulationFile(File file) {
|
|
||||||
HashMap<String,String> samplesToPopulation = new HashMap<String,String>();
|
|
||||||
try {
|
|
||||||
BufferedReader in = new BufferedReader( new FileReader( file ));
|
|
||||||
String line = in.readLine();
|
|
||||||
while ( line != null ) {
|
|
||||||
String[] populationSamples = line.split("\t");
|
|
||||||
String population = populationSamples[0];
|
|
||||||
for ( int i = 1; i < populationSamples.length; i ++ ) {
|
|
||||||
samplesToPopulation.put(populationSamples[i], population);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch ( IOException e) {
|
|
||||||
throw new StingException("Error reading population file", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
return samplesToPopulation;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testSNPs() {
|
public void testSNPs() {
|
||||||
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
|
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
|
||||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B input,Plink,"+testPedFile+" -o %s";
|
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
Arrays.asList("d19f28fdbe3e731522a52c5329777a9f"));
|
Arrays.asList("d19f28fdbe3e731522a52c5329777a9f"));
|
||||||
executeTest("Test SNPs", spec);
|
executeTest("Test SNPs", spec);
|
||||||
|
|
@ -18,7 +18,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testIndels() {
|
public void testIndels() {
|
||||||
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
|
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
|
||||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B input,Plink,"+testPedFile+" -o %s";
|
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
Arrays.asList("257fcd5e345f2853813e37b88fbc707c"));
|
Arrays.asList("257fcd5e345f2853813e37b88fbc707c"));
|
||||||
executeTest("Test Indels", spec);
|
executeTest("Test Indels", spec);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue