diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java index 60e84ae6d..9dee3fece 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java @@ -84,8 +84,10 @@ public class Allele implements Comparable { private byte[] bases = null; + public final static String NULL_ALLELE_STRING = "-"; + public final static String NO_CALL_STRING = "."; /** A generic static NO_CALL allele for use */ - public final static Allele NO_CALL = new Allele("."); + public final static Allele NO_CALL = new Allele(NO_CALL_STRING); /** * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index dd2e64421..821a8d918 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -596,6 +596,11 @@ public class VariantContext { // // --------------------------------------------------------------------------------------------------------- + /** + * @return the number of samples in the context + */ + public int getNSamples() { return genotypes.size(); } + /** * @return true if the context has associated genotypes */ @@ -905,7 +910,7 @@ public class VariantContext { for ( Allele a : alleles ) { for ( Allele b : dest ) { if ( a.basesMatch(b) ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext" + a); + throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a); } dest.add(a); @@ -915,10 +920,10 @@ public class VariantContext { } private static Map genotypeCollectionToMap(Map dest, 
Collection genotypes) { - for ( Genotype a : genotypes ) { - if ( dest.containsKey(a.getSampleName() ) ) - throw new IllegalArgumentException("Duplicate genotype added to VariantContext " + a); - dest.put(a.getSampleName(), a); + for ( Genotype g : genotypes ) { + if ( dest.containsKey(g.getSampleName() ) ) + throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g); + dest.put(g.getSampleName(), g); } return dest; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java index 84ef7f2fa..249f87092 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java @@ -16,7 +16,7 @@ import java.util.*; public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator { public static final String SEQUENOM_NO_CALL = "0"; - public static final String SEQUENOM_NO_VARIANT = "-"; + public static final String SEQUENOM_NO_BASE = "-"; private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); @@ -27,6 +27,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator sampleNames; + public enum PlinkFileType { STANDARD_PED, RAW_PED, BINARY_PED } @@ -35,10 +37,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator iter) { + public PlinkRod(String name, PlinkVariantInfo record, ListIterator iter, List sampleNames) { super(name); currentVariant = record; variantIterator = iter; + this.sampleNames = sampleNames; } @Override @@ -81,7 +84,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator> getGenotypes() { + return currentVariant.getGenotypes(); + } + + public List getSampleNames() { + return sampleNames; } public boolean isIndel() { @@ -121,6 +127,16 @@ public class PlinkRod extends 
BasicReferenceOrderedDatum implements Iterator seqVars = instantiateVariantListFromHeader(header); ArrayList snpOffsets = getSNPOffsetsFromHeader(header); + sampleNames = new ArrayList(); + String line; do { line = reader.readLine(); @@ -180,6 +198,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator parseBinaryFormattedPlinkFile(File file) { PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); - ArrayList sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); - ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); + sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); + ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile); java.util.Collections.sort(updatedVariants); @@ -294,7 +313,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator getSampleNameOrderingFromFamFile(File famFile) { + private static ArrayList getSampleNameOrderingFromFamFile(File famFile) { BufferedReader reader; try { reader = new BufferedReader( new FileReader( famFile )); @@ -321,7 +340,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator getGenotypesFromBedFile(ArrayList variants, ArrayList samples, File bedFile) { + private ArrayList getGenotypesFromBedFile(ArrayList variants, File bedFile) { FileInputStream inStream; try { inStream = new FileInputStream(bedFile); @@ -342,11 +361,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator 3 ) { - addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); + addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode); if ( snpMajorMode ) { sampleOffset = sampleOffset + 4; - if ( sampleOffset > samples.size() -1 ) { + if ( sampleOffset > sampleNames.size() -1 ) { snpOffset ++; sampleOffset = 0; } @@ -372,7 
+391,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator variants, ArrayList sampleNames, int snpOffset, int sampleOffset, boolean major) { + private void addGenotypeByte(byte genotype, ArrayList variants, int snpOffset, int sampleOffset, boolean major) { // four genotypes encoded in this byte int[] genotypes = parseGenotypes(genotype); for ( int g : genotypes ) { @@ -406,13 +425,12 @@ class PlinkVariantInfo implements Comparable { private String variantName; private GenomeLoc loc; - private HashSet alleles = new HashSet(); - private HashSet genotypes = new HashSet(); + private Map> genotypes = new HashMap>(); // for indels private boolean isIndel = false; private boolean isInsertion = false; - private int indelLength = 0; + private int length = 1; // for binary parsing private String locAllele1; @@ -432,18 +450,22 @@ class PlinkVariantInfo implements Comparable { return variantName; } - public VariantContext getVariantContext() { - try { - return new VariantContext(variantName, loc, alleles, genotypes); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. 
a sample isn't present more than one time in your ped file"); - } + public Map> getGenotypes() { + return genotypes; } public boolean isIndel() { return isIndel; } + public boolean isInsertion() { + return isInsertion; + } + + public int getLength() { + return length; + } + public void setGenomeLoc(GenomeLoc loc) { this.loc = loc; } @@ -463,10 +485,17 @@ class PlinkVariantInfo implements Comparable { String pos = pieces[1].substring(1); loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); - if ( pieces.length > 2 && (pieces[2].equals("gI") || pieces[2].equals("gD")) ) { + if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) { // it's an indel isIndel = true; - isInsertion = pieces[2].equals("gI"); + isInsertion = pieces[2].startsWith("gI"); + try { + // length of insertion on reference is still 1 + if ( !isInsertion ) + length = Integer.parseInt(pieces[2].substring(2)); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])"); + } } } @@ -482,32 +511,16 @@ class PlinkVariantInfo implements Comparable { public void addGenotypeEntry(String[] alleleStrings, String sampleName) { - ArrayList myAlleles = new ArrayList(2); + ArrayList alleles = new ArrayList(2); for ( String alleleString : alleleStrings ) { - if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) { - myAlleles.add(Allele.NO_CALL); - } else { - Allele allele; - if ( !isIndel ) { - allele = new Allele(alleleString); - } else if ( alleleString.equals(PlinkRod.SEQUENOM_NO_VARIANT) ) { - allele = new Allele(alleleString, isInsertion); - } else { - allele = new Allele(alleleString, !isInsertion); - if ( indelLength == 0 ) { - indelLength = alleleString.length(); - loc = GenomeLocParser.parseGenomeLoc(loc.getContig(), loc.getStart(), loc.getStart()+indelLength-1); - } - } - - myAlleles.add(allele); - if ( alleles.size() < 2 ) - alleles.add(allele); - } + if 
( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) + alleles.add(Allele.NO_CALL_STRING); + else + alleles.add(alleleString); } - genotypes.add(new Genotype(sampleName, myAlleles, 20.0)); + genotypes.put(sampleName, alleles); } public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 333608e2d..e33d1bd4e 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; @@ -36,6 +38,7 @@ public class VariantContextAdaptors { adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor()); adaptors.put(RodVCF.class, new RodVCFAdaptor()); adaptors.put(VCFRecord.class, new VCFRecordAdaptor()); + adaptors.put(PlinkRod.class, new PlinkRodAdaptor()); } public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { @@ -46,6 +49,7 @@ public class VariantContextAdaptors { /** generic superclass */ private static abstract class VCAdaptor { abstract VariantContext convert(String name, Object input); + abstract VariantContext convert(String name, Object input, Allele refAllele); } public static VariantContext toVariantContext(String name, Object variantContainingObject) { @@ -56,9 +60,17 @@ public class VariantContextAdaptors { } } + public static 
VariantContext toVariantContext(String name, Object variantContainingObject, Allele refAllele) { + if ( ! adaptors.containsKey(variantContainingObject.getClass()) ) + return null; + else { + return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, refAllele); + } + } + // -------------------------------------------------------------------------------------------------------------- // - // From here below you can add adaptor classes for new rods (or other types) to convert to VCF + // From here below you can add adaptor classes for new rods (or other types) to convert to VC // // -------------------------------------------------------------------------------------------------------------- @@ -72,16 +84,17 @@ public class VariantContextAdaptors { private static class RodDBSnpAdaptor extends VCAdaptor { VariantContext convert(String name, Object input) { + if ( ! Allele.acceptableAlleleBases(((rodDbSNP)input).getReference()) ) + return null; + Allele refAllele = new Allele(((rodDbSNP)input).getReference(), true); + return convert(name, input, refAllele); + } + + VariantContext convert(String name, Object input, Allele refAllele) { rodDbSNP dbsnp = (rodDbSNP)input; if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) { // add the reference allele - if ( ! 
Allele.acceptableAlleleBases(dbsnp.getReference()) ) { - //System.out.printf("Excluding dbsnp record %s%n", dbsnp); - return null; - } - List alleles = new ArrayList(); - Allele refAllele = new Allele(dbsnp.getReference(), true); alleles.add(refAllele); // add all of the alt alleles @@ -112,18 +125,28 @@ public class VariantContextAdaptors { private static class RodVCFAdaptor extends VCAdaptor { VariantContext convert(String name, Object input) { - return vcfToVariantContext(name, ((RodVCF)input).getRecord()); + Allele refAllele = new Allele(((RodVCF)input).getReference(), true); + return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele); + } + + VariantContext convert(String name, Object input, Allele refAllele) { + return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele); } } private static class VCFRecordAdaptor extends VCAdaptor { VariantContext convert(String name, Object input) { - return vcfToVariantContext(name, (VCFRecord)input); + Allele refAllele = new Allele(((VCFRecord)input).getReference(), true); + return vcfToVariantContext(name, (VCFRecord)input, refAllele); + } + + VariantContext convert(String name, Object input, Allele refAllele) { + return vcfToVariantContext(name, (VCFRecord)input, refAllele); } } - private static VariantContext vcfToVariantContext(String name, VCFRecord vcf) { + private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) { if ( vcf.isSNP() || vcf.isIndel() ) { // add the reference allele if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) { @@ -137,7 +160,6 @@ public class VariantContextAdaptors { // add all of the alt alleles List alleles = new ArrayList(); - Allele refAllele = new Allele(vcf.getReference(), true); alleles.add(refAllele); for ( String alt : vcf.getAlternateAlleleList() ) { if ( ! Allele.acceptableAlleleBases(alt) ) { @@ -194,9 +216,9 @@ public class VariantContextAdaptors { return new VCFHeader(hInfo == null ? 
new HashSet() : hInfo, names); } - public static VCFRecord toVCF(VariantContext vc) { + public static VCFRecord toVCF(VariantContext vc, char previousRefBase) { // deal with the reference - String referenceBase = new String(vc.getReference().getBases()); + String referenceBases = new String(vc.getReference().getBases()); String contig = vc.getLocation().getContig(); long position = vc.getLocation().getStart(); @@ -205,15 +227,51 @@ public class VariantContextAdaptors { String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS; - Map alleleMap = new HashMap(); - alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup + // TODO - fixme: temporarily using Strings as keys because Alleles aren't hashing correctly + Map alleleMap = new HashMap(); + alleleMap.put(Allele.NO_CALL.toString(), new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup List vcfAltAlleles = new ArrayList(); for ( Allele a : vc.getAlleles() ) { - VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases())); + + VCFGenotypeEncoding encoding; + + // This is tricky because the VCF spec states that the reference must be a single + // character, whereas the reference alleles for deletions of length > 1 are strings. + // To be safe, we require the reference base be passed in and we use that whenever + // we know that the given allele is the reference. 
+ + String alleleString = new String(a.getBases()); + if ( vc.getType() == VariantContext.Type.MIXED ) { + throw new UnsupportedOperationException("Conversion from a mixed type isn't currently supported"); + } else if ( vc.getType() == VariantContext.Type.INDEL ) { + if ( a.isNull() ) { + if ( a.isReference() ) { + // ref, where alt is insertion + encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase)); + } else { + // non-ref deletion + encoding = new VCFGenotypeEncoding("D" + Integer.toString(referenceBases.length())); + } + } else if ( a.isReference() ) { + // ref, where alt is deletion + encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase)); + } else { + // non-ref insertion + encoding = new VCFGenotypeEncoding("I" + alleleString); + } + } else if ( vc.getType() == VariantContext.Type.NO_VARIATION ) { + // ref + encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase)); + } else { + // ref or alt for snp + encoding = new VCFGenotypeEncoding(alleleString); + } + if ( a.isNonReference() ) { vcfAltAlleles.add(encoding); } - alleleMap.put(a, encoding); + + alleleMap.put(a.toString(), encoding); } List vcfGenotypeAttributeKeys = new ArrayList(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY)); @@ -225,7 +283,7 @@ public class VariantContextAdaptors { for ( Genotype g : vc.getGenotypesSortedByName() ) { List encodings = new ArrayList(g.getPloidy()); for ( Allele a : g.getAlleles() ) { - encodings.add(alleleMap.get(a)); + encodings.add(alleleMap.get(a.toString())); } VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? 
VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; @@ -240,7 +298,6 @@ public class VariantContextAdaptors { genotypeObjects.add(vcfG); } - // info fields Map infoFields = new HashMap(); for ( Map.Entry elt : vc.getAttributes().entrySet() ) { @@ -251,7 +308,7 @@ public class VariantContextAdaptors { } } - return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects); + return new VCFRecord(Character.toString(previousRefBase), contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects); } private static List calcVCFGenotypeKeys(VariantContext vc) { @@ -266,4 +323,66 @@ public class VariantContextAdaptors { keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); return Utils.sorted(new ArrayList(keys)); } + + // -------------------------------------------------------------------------------------------------------------- + // + // Plink to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + + private static class PlinkRodAdaptor extends VCAdaptor { + VariantContext convert(String name, Object input) { + throw new UnsupportedOperationException("Conversion from Plink to VC requires a reference allele"); + } + + VariantContext convert(String name, Object input, Allele refAllele) { + PlinkRod plink = (PlinkRod)input; + + HashMap alleles = new HashMap(); // use String keys to help maintain uniqueness + alleles.put(refAllele.isNull() ? 
Allele.NULL_ALLELE_STRING : new String(refAllele.getBases()), refAllele); + + Set genotypes = new HashSet(); + + Map> genotypeSets = plink.getGenotypes(); + // for each sample + for ( Map.Entry> genotype : genotypeSets.entrySet() ) { + ArrayList myAlleles = new ArrayList(2); + + // for each allele + for ( String alleleString : genotype.getValue() ) { + Allele allele; + if ( alleleString.equals(Allele.NO_CALL_STRING) ) { + allele = Allele.NO_CALL; + } else { + if ( !plink.isIndel() ) { + allele = new Allele(alleleString, refAllele.basesMatch(alleleString)); + } else if ( alleleString.equals(Allele.NULL_ALLELE_STRING) ) { + allele = new Allele(alleleString, plink.isInsertion()); + } else { + allele = new Allele(alleleString, !plink.isInsertion()); + } + + if ( !alleles.containsKey(alleleString) ) + alleles.put(alleleString, allele); + } + + myAlleles.add(allele); + } + + // create the genotype + genotypes.add(new Genotype(genotype.getKey(), myAlleles)); + } + + // create the variant context + try { + GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1); + VariantContext vc = new VariantContext(plink.getVariantName(), loc, alleles.values(), genotypes); + vc.validate(); + return vc; + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. 
a sample isn't present more than one time in your ped file"); + } + } + } + } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java index f12a4dbe9..e9bd120ff 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java @@ -5,15 +5,10 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.EnumSet; -import java.util.Set; -import java.util.HashSet; -import java.util.List; import java.io.File; /** @@ -29,7 +24,7 @@ public class TestVariantContextWalker extends RodWalker { @Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false) boolean onlyContextsStartinAtCurrentPosition = false; - @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) + @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) boolean printContexts = false; @Argument(fullName="outputVCF", 
doc="If provided, we'll convert the first input context into a VCF", required=false) @@ -57,7 +52,7 @@ public class TestVariantContextWalker extends RodWalker { wroteHeader = true; } - writer.addRecord(VariantContextAdaptors.toVCF(vc)); + writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase())); } n++; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index 4d2413f44..ec1717457 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -396,13 +396,13 @@ public class VariantEval2Walker extends RodWalker { } if ( group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) ) - writeInterestingSite(interestingReasons, vc); + writeInterestingSite(interestingReasons, vc, ref.getBase()); } return 0; } - private void writeInterestingSite(List interestingReasons, VariantContext vc) { + private void writeInterestingSite(List interestingReasons, VariantContext vc, char ref) { if ( writer != null && interestingReasons.size() > 0 ) { MutableVariantContext mvc = new MutableVariantContext(vc); @@ -431,7 +431,7 @@ public class VariantEval2Walker extends RodWalker { wroteHeader = true; } - writer.addRecord(VariantContextAdaptors.toVCF(mvc)); + writer.addRecord(VariantContextAdaptors.toVCF(mvc, ref)); //interestingReasons.clear(); } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java index 4eb287e88..1b4748242 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java +++ 
b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java @@ -10,19 +10,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; -import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader; -import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator; import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.Matcher; -import java.io.File; -import java.io.FileNotFoundException; /** * Test routine for new VariantContext object @@ -68,6 +60,9 @@ public class BeagleTrioToVCFWalker extends RodWalker { } } + if ( vc != null ) + writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase())); + return vc; } @@ -121,13 +116,6 @@ public class BeagleTrioToVCFWalker extends RodWalker { } public Long reduce(VariantContext vc, Long prevReduce) { - if ( vc == null ) { - return prevReduce; - } else { - writer.addRecord(VariantContextAdaptors.toVCF(vc)); - prevReduce++; - } - - return prevReduce; + return ( vc == null ? 
prevReduce : prevReduce+1); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java index 4ec5104f3..69ad0cdff 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java @@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.refdata.PlinkRod; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.genotype.vcf.*; @@ -22,6 +24,7 @@ import java.util.*; /** * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) */ +@Reference(window=@Window(start=0,stop=40)) public class PlinkToVCF extends RodWalker { @Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true) public File vcfFile = null; @@ -39,27 +42,19 @@ public class PlinkToVCF extends RodWalker { @Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false) public double maxHomNonref = 1.1; - private final Set HEADER_FIELDS = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype", - "FID","IID","PAT","MAT","SEX","PHENOTYPE")); + private static final int MAX_INDEL_SIZE = 40; + private final int INIT_NUMBER_OF_POPULATIONS = 10; - private final int DEFAULT_QUALITY = 20; private ArrayList sampleNames = new ArrayList(nSamples); - private VCFGenotypeWriterAdapter vcfWriter; + private VCFWriter vcfWriter = null; 
private final HardyWeinberg HWCalc = new HardyWeinberg(); private final boolean useSmartHardy = popFile != null; private HashMap samplesToPopulation; public void initialize() { - vcfWriter = new VCFGenotypeWriterAdapter(vcfFile); if ( useSmartHardy ) { samplesToPopulation = parsePopulationFile(popFile); } - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "PlinkToVCF")); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - vcfWriter.writeHeader(new TreeSet(sampleNames), hInfo); - nSamples = sampleNames.size(); } public Integer reduceInit() { @@ -85,50 +80,69 @@ public class PlinkToVCF extends RodWalker { if ( plinkRod == null ) return null; + if ( vcfWriter == null ) + initializeWriter(plinkRod); + return addVariantInformationToCall(ref, plinkRod); } + private void initializeWriter(PlinkRod plinkRod) { + vcfWriter = new VCFWriter(vcfFile); + + Set hInfo = new HashSet(); + hInfo.add(new VCFHeaderLine("source", "PlinkToVCF")); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + VCFHeader header = new VCFHeader(hInfo, new TreeSet(plinkRod.getSampleNames())); + vcfWriter.writeHeader(header); + + nSamples = sampleNames.size(); + + } + public Integer reduce(VCFRecord call, Integer numVariants) { if ( call != null ) { numVariants++; - printToVCF(call); + vcfWriter.addRecord(call); } return numVariants; } public void onTraversalDone(Integer finalReduce) { logger.info("Variants processed="+finalReduce.toString()); - vcfWriter.close(); + if ( vcfWriter != null ) + vcfWriter.close(); } - private void printToVCF(VCFRecord call) { - try { - vcfWriter.addRecord(call); - } catch ( RuntimeException e ) { - if ( e.getLocalizedMessage().equalsIgnoreCase("We have more genotype samples than the header specified")) { - throw new StingException("We have more sample genotypes than sample names -- check that there are no 
duplicates in the .ped file",e); - } else { - throw new StingException("Error in VCF creation: "+e.getLocalizedMessage(),e); - } - } - } private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) { - VariantContext vContext = plinkRod.getVariantContext(); - VCFRecord record = VariantContextAdaptors.toVCF(vContext); + // determine the reference allele + Allele refAllele; + if ( !plinkRod.isIndel() ) { + refAllele = new Allele(Character.toString(ref.getBase()), true); + } else if ( plinkRod.isInsertion() ) { + refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true); + } else { + if ( plinkRod.getLength() > MAX_INDEL_SIZE ) + throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE); + char[] deletion = new char[plinkRod.getLength()]; + System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength()); + refAllele = new Allele(new String(deletion), true); + } + + VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele); + VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase()); record.setGenotypeFormatString("GT"); - int numNoCalls = vContext.getNoCallCount(); - int numHomVarCalls = vContext.getHomVarCount(); + if ( true ) + return record; + int numHetCalls = vContext.getHetCount(); + double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples(); + double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples(); - - double noCallProp = ( (double) numNoCalls )/( (double) sampleNames.size()); - double homNonRProp = ( (double) numHomVarCalls )/( (double) sampleNames.size() - numNoCalls); - - record.setQual(DEFAULT_QUALITY); String hw = hardyWeinbergCalculation(ref,record); double hwScore = hw != null ? 
Double.valueOf(hw) : -0.0; // TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore)); @@ -136,7 +150,7 @@ public class PlinkToVCF extends RodWalker { record.setFilterString("Hardy-Weinberg"); } else if ( noCallProp > maxNoCall ) { record.setFilterString("No-calls"); - } else if ( homNonRProp > maxHomNonref) { + } else if ( homVarProp > maxHomNonref) { record.setFilterString("HomNonref-calls"); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 24866b11f..d170c0a86 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -582,7 +582,10 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { builder.append(EMPTY_ALLELE_FIELD); } builder.append(FIELD_SEPERATOR); - builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual())); + if ( MathUtils.compareDoubles(mQual, -1.0) == 0 ) + builder.append("-1"); + else + builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual)); builder.append(FIELD_SEPERATOR); builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes())); builder.append(FIELD_SEPERATOR); @@ -623,12 +626,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @param header the header object */ private void addGenotypeData(StringBuilder builder, VCFHeader header) { + Map gMap = genotypeListToMap(getGenotypes()); + StringBuffer tempStr = new StringBuffer(); - if ( header.getGenotypeSamples().size() < getGenotypes().size() ) - throw new IllegalStateException("We have more genotype samples than the header specified"); + if ( header.getGenotypeSamples().size() < getGenotypes().size() ) { + for ( String sample : gMap.keySet() ) { + if ( !header.getGenotypeSamples().contains(sample) ) + System.err.println("Sample " + sample + " 
is a duplicate or is otherwise not present in the header"); + else + header.getGenotypeSamples().remove(sample); + } + throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated"); + } tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString); - Map gMap = genotypeListToMap(getGenotypes()); String[] genotypeFormatStrings = mGenotypeFormatString.split(":"); for ( String genotype : header.getGenotypeSamples() ) { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index bf8f4a6b9..bc5ac2ea9 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -7,7 +7,7 @@ import java.util.Arrays; public class IndelRealignerIntegrationTest extends WalkerTest { @Test - public void testIntervals() { + public void testRealigner() { String[] md5lod5 = {"67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"}; WalkerTestSpec spec1 = new WalkerTestSpec( @@ -23,4 +23,4 @@ public class IndelRealignerIntegrationTest extends WalkerTest { Arrays.asList(md5lod200)); executeTest("testLod200", spec2); } -} +} \ No newline at end of file