Various improvements to plink, variant context, and VCF code.

We almost completely support indels. Not yet done with plink stuff.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2926 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-04 17:58:01 +00:00
parent c8077b7a22
commit 0dd65461a1
10 changed files with 285 additions and 138 deletions

View File

@ -84,8 +84,10 @@ public class Allele implements Comparable<Allele> {
private byte[] bases = null; private byte[] bases = null;
public final static String NULL_ALLELE_STRING = "-";
public final static String NO_CALL_STRING = ".";
/** A generic static NO_CALL allele for use */ /** A generic static NO_CALL allele for use */
public final static Allele NO_CALL = new Allele("."); public final static Allele NO_CALL = new Allele(NO_CALL_STRING);
/** /**
* Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases

View File

@ -596,6 +596,11 @@ public class VariantContext {
// //
// --------------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------------
/**
* @return the number of samples in the context
*/
public int getNSamples() { return genotypes.size(); }
/** /**
* @return true if the context has associated genotypes * @return true if the context has associated genotypes
*/ */
@ -905,7 +910,7 @@ public class VariantContext {
for ( Allele a : alleles ) { for ( Allele a : alleles ) {
for ( Allele b : dest ) { for ( Allele b : dest ) {
if ( a.basesMatch(b) ) if ( a.basesMatch(b) )
throw new IllegalArgumentException("Duplicate allele added to VariantContext" + a); throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
} }
dest.add(a); dest.add(a);
@ -915,10 +920,10 @@ public class VariantContext {
} }
private static Map<String, Genotype> genotypeCollectionToMap(Map<String, Genotype> dest, Collection<Genotype> genotypes) { private static Map<String, Genotype> genotypeCollectionToMap(Map<String, Genotype> dest, Collection<Genotype> genotypes) {
for ( Genotype a : genotypes ) { for ( Genotype g : genotypes ) {
if ( dest.containsKey(a.getSampleName() ) ) if ( dest.containsKey(g.getSampleName() ) )
throw new IllegalArgumentException("Duplicate genotype added to VariantContext " + a); throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g);
dest.put(a.getSampleName(), a); dest.put(g.getSampleName(), g);
} }
return dest; return dest;

View File

@ -16,7 +16,7 @@ import java.util.*;
public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<PlinkRod> { public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<PlinkRod> {
public static final String SEQUENOM_NO_CALL = "0"; public static final String SEQUENOM_NO_CALL = "0";
public static final String SEQUENOM_NO_VARIANT = "-"; public static final String SEQUENOM_NO_BASE = "-";
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex", private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
@ -27,6 +27,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
private PlinkFileType plinkFileType; private PlinkFileType plinkFileType;
private List<String> sampleNames;
public enum PlinkFileType { public enum PlinkFileType {
STANDARD_PED, RAW_PED, BINARY_PED STANDARD_PED, RAW_PED, BINARY_PED
} }
@ -35,10 +37,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
super(name); super(name);
} }
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter) { public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter, List<String> sampleNames) {
super(name); super(name);
currentVariant = record; currentVariant = record;
variantIterator = iter; variantIterator = iter;
this.sampleNames = sampleNames;
} }
@Override @Override
@ -81,7 +84,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
// get the next record // get the next record
currentVariant = variantIterator.next(); currentVariant = variantIterator.next();
return new PlinkRod(name, currentVariant, variantIterator); return new PlinkRod(name, currentVariant, variantIterator, sampleNames);
} }
@Override @Override
@ -111,9 +114,12 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
} }
public VariantContext getVariantContext() { public Map<String, List<String>> getGenotypes() {
assertNotNull(); return currentVariant.getGenotypes();
return currentVariant.getVariantContext(); }
public List<String> getSampleNames() {
return sampleNames;
} }
public boolean isIndel() { public boolean isIndel() {
@ -121,6 +127,16 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return currentVariant.isIndel(); return currentVariant.isIndel();
} }
public boolean isInsertion() {
assertNotNull();
return currentVariant.isInsertion();
}
public int getLength() {
assertNotNull();
return currentVariant.getLength();
}
// AM I PARSING A TEXT OR A BINARY FILE ?? // AM I PARSING A TEXT OR A BINARY FILE ??
@ -150,6 +166,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header); ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header);
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header); ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
sampleNames = new ArrayList<String>();
String line; String line;
do { do {
line = reader.readLine(); line = reader.readLine();
@ -180,6 +198,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
plinkInfo = plinkLine.split("\t"); plinkInfo = plinkLine.split("\t");
String individualName = plinkInfo[1]; String individualName = plinkInfo[1];
sampleNames.add(individualName);
int snpNumber = 0; int snpNumber = 0;
for ( int i : offsets ) { for ( int i : offsets ) {
@ -237,8 +256,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) { private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) {
PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file);
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile);
java.util.Collections.sort(updatedVariants); java.util.Collections.sort(updatedVariants);
@ -294,7 +313,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return variants; return variants;
} }
private ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) { private static ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
BufferedReader reader; BufferedReader reader;
try { try {
reader = new BufferedReader( new FileReader( famFile )); reader = new BufferedReader( new FileReader( famFile ));
@ -321,7 +340,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return sampleNames; return sampleNames;
} }
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, ArrayList<String> samples, File bedFile) { private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, File bedFile) {
FileInputStream inStream; FileInputStream inStream;
try { try {
inStream = new FileInputStream(bedFile); inStream = new FileInputStream(bedFile);
@ -342,11 +361,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
bytesRead++; bytesRead++;
if ( genotype != -1 ) { if ( genotype != -1 ) {
if ( bytesRead > 3 ) { if ( bytesRead > 3 ) {
addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode);
if ( snpMajorMode ) { if ( snpMajorMode ) {
sampleOffset = sampleOffset + 4; sampleOffset = sampleOffset + 4;
if ( sampleOffset > samples.size() -1 ) { if ( sampleOffset > sampleNames.size() -1 ) {
snpOffset ++; snpOffset ++;
sampleOffset = 0; sampleOffset = 0;
} }
@ -372,7 +391,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return variants; return variants;
} }
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, ArrayList<String> sampleNames, int snpOffset, int sampleOffset, boolean major) { private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, int snpOffset, int sampleOffset, boolean major) {
// four genotypes encoded in this byte // four genotypes encoded in this byte
int[] genotypes = parseGenotypes(genotype); int[] genotypes = parseGenotypes(genotype);
for ( int g : genotypes ) { for ( int g : genotypes ) {
@ -406,13 +425,12 @@ class PlinkVariantInfo implements Comparable {
private String variantName; private String variantName;
private GenomeLoc loc; private GenomeLoc loc;
private HashSet<Allele> alleles = new HashSet<Allele>(); private Map<String, List<String>> genotypes = new HashMap<String, List<String>>();
private HashSet<Genotype> genotypes = new HashSet<Genotype>();
// for indels // for indels
private boolean isIndel = false; private boolean isIndel = false;
private boolean isInsertion = false; private boolean isInsertion = false;
private int indelLength = 0; private int length = 1;
// for binary parsing // for binary parsing
private String locAllele1; private String locAllele1;
@ -432,18 +450,22 @@ class PlinkVariantInfo implements Comparable {
return variantName; return variantName;
} }
public VariantContext getVariantContext() { public Map<String, List<String>> getGenotypes() {
try { return genotypes;
return new VariantContext(variantName, loc, alleles, genotypes);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
}
} }
public boolean isIndel() { public boolean isIndel() {
return isIndel; return isIndel;
} }
public boolean isInsertion() {
return isInsertion;
}
public int getLength() {
return length;
}
public void setGenomeLoc(GenomeLoc loc) { public void setGenomeLoc(GenomeLoc loc) {
this.loc = loc; this.loc = loc;
} }
@ -463,10 +485,17 @@ class PlinkVariantInfo implements Comparable {
String pos = pieces[1].substring(1); String pos = pieces[1].substring(1);
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
if ( pieces.length > 2 && (pieces[2].equals("gI") || pieces[2].equals("gD")) ) { if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
// it's an indel // it's an indel
isIndel = true; isIndel = true;
isInsertion = pieces[2].equals("gI"); isInsertion = pieces[2].startsWith("gI");
try {
// length of insertion on reference is still 1
if ( !isInsertion )
length = Integer.parseInt(pieces[2].substring(2));
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])");
}
} }
} }
@ -482,32 +511,16 @@ class PlinkVariantInfo implements Comparable {
public void addGenotypeEntry(String[] alleleStrings, String sampleName) { public void addGenotypeEntry(String[] alleleStrings, String sampleName) {
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2); ArrayList<String> alleles = new ArrayList<String>(2);
for ( String alleleString : alleleStrings ) { for ( String alleleString : alleleStrings ) {
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) { if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) )
myAlleles.add(Allele.NO_CALL); alleles.add(Allele.NO_CALL_STRING);
} else { else
Allele allele; alleles.add(alleleString);
if ( !isIndel ) {
allele = new Allele(alleleString);
} else if ( alleleString.equals(PlinkRod.SEQUENOM_NO_VARIANT) ) {
allele = new Allele(alleleString, isInsertion);
} else {
allele = new Allele(alleleString, !isInsertion);
if ( indelLength == 0 ) {
indelLength = alleleString.length();
loc = GenomeLocParser.parseGenomeLoc(loc.getContig(), loc.getStart(), loc.getStart()+indelLength-1);
}
}
myAlleles.add(allele);
if ( alleles.size() < 2 )
alleles.add(allele);
}
} }
genotypes.add(new Genotype(sampleName, myAlleles, 20.0)); genotypes.put(sampleName, alleles);
} }
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) {

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
@ -36,6 +38,7 @@ public class VariantContextAdaptors {
adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor()); adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor());
adaptors.put(RodVCF.class, new RodVCFAdaptor()); adaptors.put(RodVCF.class, new RodVCFAdaptor());
adaptors.put(VCFRecord.class, new VCFRecordAdaptor()); adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
} }
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
@ -46,6 +49,7 @@ public class VariantContextAdaptors {
/** generic superclass */ /** generic superclass */
private static abstract class VCAdaptor { private static abstract class VCAdaptor {
abstract VariantContext convert(String name, Object input); abstract VariantContext convert(String name, Object input);
abstract VariantContext convert(String name, Object input, Allele refAllele);
} }
public static VariantContext toVariantContext(String name, Object variantContainingObject) { public static VariantContext toVariantContext(String name, Object variantContainingObject) {
@ -56,9 +60,17 @@ public class VariantContextAdaptors {
} }
} }
public static VariantContext toVariantContext(String name, Object variantContainingObject, Allele refAllele) {
if ( ! adaptors.containsKey(variantContainingObject.getClass()) )
return null;
else {
return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, refAllele);
}
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// From here below you can add adaptor classes for new rods (or other types) to convert to VCF // From here below you can add adaptor classes for new rods (or other types) to convert to VC
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@ -72,16 +84,17 @@ public class VariantContextAdaptors {
private static class RodDBSnpAdaptor extends VCAdaptor { private static class RodDBSnpAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) { VariantContext convert(String name, Object input) {
if ( ! Allele.acceptableAlleleBases(((rodDbSNP)input).getReference()) )
return null;
Allele refAllele = new Allele(((rodDbSNP)input).getReference(), true);
return convert(name, input, refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
rodDbSNP dbsnp = (rodDbSNP)input; rodDbSNP dbsnp = (rodDbSNP)input;
if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) { if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) {
// add the reference allele // add the reference allele
if ( ! Allele.acceptableAlleleBases(dbsnp.getReference()) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;
}
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
Allele refAllele = new Allele(dbsnp.getReference(), true);
alleles.add(refAllele); alleles.add(refAllele);
// add all of the alt alleles // add all of the alt alleles
@ -112,18 +125,28 @@ public class VariantContextAdaptors {
private static class RodVCFAdaptor extends VCAdaptor { private static class RodVCFAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) { VariantContext convert(String name, Object input) {
return vcfToVariantContext(name, ((RodVCF)input).getRecord()); Allele refAllele = new Allele(((RodVCF)input).getReference(), true);
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
} }
} }
private static class VCFRecordAdaptor extends VCAdaptor { private static class VCFRecordAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) { VariantContext convert(String name, Object input) {
return vcfToVariantContext(name, (VCFRecord)input); Allele refAllele = new Allele(((VCFRecord)input).getReference(), true);
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
} }
} }
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf) { private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) {
if ( vcf.isSNP() || vcf.isIndel() ) { if ( vcf.isSNP() || vcf.isIndel() ) {
// add the reference allele // add the reference allele
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) { if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
@ -137,7 +160,6 @@ public class VariantContextAdaptors {
// add all of the alt alleles // add all of the alt alleles
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
Allele refAllele = new Allele(vcf.getReference(), true);
alleles.add(refAllele); alleles.add(refAllele);
for ( String alt : vcf.getAlternateAlleleList() ) { for ( String alt : vcf.getAlternateAlleleList() ) {
if ( ! Allele.acceptableAlleleBases(alt) ) { if ( ! Allele.acceptableAlleleBases(alt) ) {
@ -194,9 +216,9 @@ public class VariantContextAdaptors {
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names); return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
} }
public static VCFRecord toVCF(VariantContext vc) { public static VCFRecord toVCF(VariantContext vc, char previousRefBase) {
// deal with the reference // deal with the reference
String referenceBase = new String(vc.getReference().getBases()); String referenceBases = new String(vc.getReference().getBases());
String contig = vc.getLocation().getContig(); String contig = vc.getLocation().getContig();
long position = vc.getLocation().getStart(); long position = vc.getLocation().getStart();
@ -205,15 +227,51 @@ public class VariantContextAdaptors {
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS; String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS;
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>(); // TODO - fixme: temporarily using Strings as keys because Alleles aren't hashing correctly
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup Map<String, VCFGenotypeEncoding> alleleMap = new HashMap<String, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL.toString(), new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>(); List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
for ( Allele a : vc.getAlleles() ) { for ( Allele a : vc.getAlleles() ) {
VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases()));
VCFGenotypeEncoding encoding;
// This is tricky because the VCF spec states that the reference must be a single
// character, whereas the reference alleles for deletions of length > 1 are strings.
// To be safe, we require the reference base be passed in and we use that whenever
// we know that the given allele is the reference.
String alleleString = new String(a.getBases());
if ( vc.getType() == VariantContext.Type.MIXED ) {
throw new UnsupportedOperationException("Conversion from a mixed type isn't currently supported");
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
if ( a.isNull() ) {
if ( a.isReference() ) {
// ref, where alt is insertion
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// non-ref deletion
encoding = new VCFGenotypeEncoding("D" + Integer.toString(referenceBases.length()));
}
} else if ( a.isReference() ) {
// ref, where alt is deletion
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// non-ref insertion
encoding = new VCFGenotypeEncoding("I" + alleleString);
}
} else if ( vc.getType() == VariantContext.Type.NO_VARIATION ) {
// ref
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// ref or alt for snp
encoding = new VCFGenotypeEncoding(alleleString);
}
if ( a.isNonReference() ) { if ( a.isNonReference() ) {
vcfAltAlleles.add(encoding); vcfAltAlleles.add(encoding);
} }
alleleMap.put(a, encoding);
alleleMap.put(a.toString(), encoding);
} }
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY)); List<String> vcfGenotypeAttributeKeys = new ArrayList<String>(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY));
@ -225,7 +283,7 @@ public class VariantContextAdaptors {
for ( Genotype g : vc.getGenotypesSortedByName() ) { for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy()); List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
for ( Allele a : g.getAlleles() ) { for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a)); encodings.add(alleleMap.get(a.toString()));
} }
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
@ -240,7 +298,6 @@ public class VariantContextAdaptors {
genotypeObjects.add(vcfG); genotypeObjects.add(vcfG);
} }
// info fields // info fields
Map<String, String> infoFields = new HashMap<String, String>(); Map<String, String> infoFields = new HashMap<String, String>();
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) { for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
@ -251,7 +308,7 @@ public class VariantContextAdaptors {
} }
} }
return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects); return new VCFRecord(Character.toString(previousRefBase), contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
} }
private static List<String> calcVCFGenotypeKeys(VariantContext vc) { private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
@ -266,4 +323,66 @@ public class VariantContextAdaptors {
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
return Utils.sorted(new ArrayList<String>(keys)); return Utils.sorted(new ArrayList<String>(keys));
} }
// --------------------------------------------------------------------------------------------------------------
//
// Plink to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
private static class PlinkRodAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
throw new UnsupportedOperationException("Conversion from Plink to VC requires a reference allele");
}
VariantContext convert(String name, Object input, Allele refAllele) {
PlinkRod plink = (PlinkRod)input;
HashMap<String, Allele> alleles = new HashMap<String, Allele>(); // use String keys to help maintain uniqueness
alleles.put(refAllele.isNull() ? Allele.NULL_ALLELE_STRING : new String(refAllele.getBases()), refAllele);
Set<Genotype> genotypes = new HashSet<Genotype>();
Map<String, List<String>> genotypeSets = plink.getGenotypes();
// for each sample
for ( Map.Entry<String, List<String>> genotype : genotypeSets.entrySet() ) {
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
// for each allele
for ( String alleleString : genotype.getValue() ) {
Allele allele;
if ( alleleString.equals(Allele.NO_CALL_STRING) ) {
allele = Allele.NO_CALL;
} else {
if ( !plink.isIndel() ) {
allele = new Allele(alleleString, refAllele.basesMatch(alleleString));
} else if ( alleleString.equals(Allele.NULL_ALLELE_STRING) ) {
allele = new Allele(alleleString, plink.isInsertion());
} else {
allele = new Allele(alleleString, !plink.isInsertion());
}
if ( !alleles.containsKey(alleleString) )
alleles.put(alleleString, allele);
}
myAlleles.add(allele);
}
// create the genotype
genotypes.add(new Genotype(genotype.getKey(), myAlleles));
}
// create the variant context
try {
GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1);
VariantContext vc = new VariantContext(plink.getVariantName(), loc, alleles.values(), genotypes);
vc.validate();
return vc;
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
}
}
}
} }

View File

@ -5,15 +5,10 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.io.File; import java.io.File;
/** /**
@ -29,7 +24,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false) @Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false)
boolean onlyContextsStartinAtCurrentPosition = false; boolean onlyContextsStartinAtCurrentPosition = false;
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) @Argument(fullName="printPerLocus", doc="If true, we'll psetenv LD_LIBRARY_PATH .:/util/gcc-4.3.0/lib64:/util/gcc-4.3.0/lib/gcc/x86_64-unknown-linux-gnu/4.3.0:/util/gcc-4.3.0/lib/:/util/lib:/broad/tools/Linux/x86_64/pkgs/boost_1.38.0/lib:/humgen/gsa-scr1/GATK_Data/bwarint the variant contexts, in addition to counts", required=false)
boolean printContexts = false; boolean printContexts = false;
@Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false) @Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false)
@ -57,7 +52,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
wroteHeader = true; wroteHeader = true;
} }
writer.addRecord(VariantContextAdaptors.toVCF(vc)); writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
} }
n++; n++;

View File

@ -396,13 +396,13 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
} }
if ( group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) ) if ( group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) )
writeInterestingSite(interestingReasons, vc); writeInterestingSite(interestingReasons, vc, ref.getBase());
} }
return 0; return 0;
} }
private void writeInterestingSite(List<String> interestingReasons, VariantContext vc) { private void writeInterestingSite(List<String> interestingReasons, VariantContext vc, char ref) {
if ( writer != null && interestingReasons.size() > 0 ) { if ( writer != null && interestingReasons.size() > 0 ) {
MutableVariantContext mvc = new MutableVariantContext(vc); MutableVariantContext mvc = new MutableVariantContext(vc);
@ -431,7 +431,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
wroteHeader = true; wroteHeader = true;
} }
writer.addRecord(VariantContextAdaptors.toVCF(mvc)); writer.addRecord(VariantContextAdaptors.toVCF(mvc, ref));
//interestingReasons.clear(); //interestingReasons.clear();
} }
} }

View File

@ -10,19 +10,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator; import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
import java.util.*; import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
/** /**
* Test routine for new VariantContext object * Test routine for new VariantContext object
@ -68,6 +60,9 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
} }
} }
if ( vc != null )
writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
return vc; return vc;
} }
@ -121,13 +116,6 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
} }
public Long reduce(VariantContext vc, Long prevReduce) { public Long reduce(VariantContext vc, Long prevReduce) {
if ( vc == null ) { return ( vc == null ? prevReduce : prevReduce+1);
return prevReduce;
} else {
writer.addRecord(VariantContextAdaptors.toVCF(vc));
prevReduce++;
}
return prevReduce;
} }
} }

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.refdata.PlinkRod;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.genotype.vcf.*;
@ -22,6 +24,7 @@ import java.util.*;
/** /**
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
*/ */
@Reference(window=@Window(start=0,stop=40))
public class PlinkToVCF extends RodWalker<VCFRecord,Integer> { public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true) @Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true)
public File vcfFile = null; public File vcfFile = null;
@ -39,27 +42,19 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false) @Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false)
public double maxHomNonref = 1.1; public double maxHomNonref = 1.1;
private final Set<String> HEADER_FIELDS = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype", private static final int MAX_INDEL_SIZE = 40;
"FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private final int INIT_NUMBER_OF_POPULATIONS = 10; private final int INIT_NUMBER_OF_POPULATIONS = 10;
private final int DEFAULT_QUALITY = 20;
private ArrayList<String> sampleNames = new ArrayList<String>(nSamples); private ArrayList<String> sampleNames = new ArrayList<String>(nSamples);
private VCFGenotypeWriterAdapter vcfWriter; private VCFWriter vcfWriter = null;
private final HardyWeinberg HWCalc = new HardyWeinberg(); private final HardyWeinberg HWCalc = new HardyWeinberg();
private final boolean useSmartHardy = popFile != null; private final boolean useSmartHardy = popFile != null;
private HashMap<String,String> samplesToPopulation; private HashMap<String,String> samplesToPopulation;
public void initialize() { public void initialize() {
vcfWriter = new VCFGenotypeWriterAdapter(vcfFile);
if ( useSmartHardy ) { if ( useSmartHardy ) {
samplesToPopulation = parsePopulationFile(popFile); samplesToPopulation = parsePopulationFile(popFile);
} }
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
vcfWriter.writeHeader(new TreeSet<String>(sampleNames), hInfo);
nSamples = sampleNames.size();
} }
public Integer reduceInit() { public Integer reduceInit() {
@ -85,50 +80,69 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
if ( plinkRod == null ) if ( plinkRod == null )
return null; return null;
if ( vcfWriter == null )
initializeWriter(plinkRod);
return addVariantInformationToCall(ref, plinkRod); return addVariantInformationToCall(ref, plinkRod);
} }
private void initializeWriter(PlinkRod plinkRod) {
vcfWriter = new VCFWriter(vcfFile);
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
VCFHeader header = new VCFHeader(hInfo, new TreeSet<String>(plinkRod.getSampleNames()));
vcfWriter.writeHeader(header);
nSamples = sampleNames.size();
}
public Integer reduce(VCFRecord call, Integer numVariants) { public Integer reduce(VCFRecord call, Integer numVariants) {
if ( call != null ) { if ( call != null ) {
numVariants++; numVariants++;
printToVCF(call); vcfWriter.addRecord(call);
} }
return numVariants; return numVariants;
} }
public void onTraversalDone(Integer finalReduce) { public void onTraversalDone(Integer finalReduce) {
logger.info("Variants processed="+finalReduce.toString()); logger.info("Variants processed="+finalReduce.toString());
vcfWriter.close(); if ( vcfWriter != null )
vcfWriter.close();
} }
private void printToVCF(VCFRecord call) {
try {
vcfWriter.addRecord(call);
} catch ( RuntimeException e ) {
if ( e.getLocalizedMessage().equalsIgnoreCase("We have more genotype samples than the header specified")) {
throw new StingException("We have more sample genotypes than sample names -- check that there are no duplicates in the .ped file",e);
} else {
throw new StingException("Error in VCF creation: "+e.getLocalizedMessage(),e);
}
}
}
private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) { private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) {
VariantContext vContext = plinkRod.getVariantContext(); // determine the reference allele
VCFRecord record = VariantContextAdaptors.toVCF(vContext); Allele refAllele;
if ( !plinkRod.isIndel() ) {
refAllele = new Allele(Character.toString(ref.getBase()), true);
} else if ( plinkRod.isInsertion() ) {
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
} else {
if ( plinkRod.getLength() > MAX_INDEL_SIZE )
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
char[] deletion = new char[plinkRod.getLength()];
System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength());
refAllele = new Allele(new String(deletion), true);
}
VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele);
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
record.setGenotypeFormatString("GT"); record.setGenotypeFormatString("GT");
int numNoCalls = vContext.getNoCallCount(); if ( true )
int numHomVarCalls = vContext.getHomVarCount(); return record;
int numHetCalls = vContext.getHetCount(); int numHetCalls = vContext.getHetCount();
double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples();
double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples();
double noCallProp = ( (double) numNoCalls )/( (double) sampleNames.size());
double homNonRProp = ( (double) numHomVarCalls )/( (double) sampleNames.size() - numNoCalls);
record.setQual(DEFAULT_QUALITY);
String hw = hardyWeinbergCalculation(ref,record); String hw = hardyWeinbergCalculation(ref,record);
double hwScore = hw != null ? Double.valueOf(hw) : -0.0; double hwScore = hw != null ? Double.valueOf(hw) : -0.0;
// TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore)); // TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore));
@ -136,7 +150,7 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
record.setFilterString("Hardy-Weinberg"); record.setFilterString("Hardy-Weinberg");
} else if ( noCallProp > maxNoCall ) { } else if ( noCallProp > maxNoCall ) {
record.setFilterString("No-calls"); record.setFilterString("No-calls");
} else if ( homNonRProp > maxHomNonref) { } else if ( homVarProp > maxHomNonref) {
record.setFilterString("HomNonref-calls"); record.setFilterString("HomNonref-calls");
} }

View File

@ -582,7 +582,10 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(EMPTY_ALLELE_FIELD); builder.append(EMPTY_ALLELE_FIELD);
} }
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual())); if ( MathUtils.compareDoubles(mQual, -1.0) == 0 )
builder.append("-1");
else
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes())); builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
@ -623,12 +626,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @param header the header object * @param header the header object
*/ */
private void addGenotypeData(StringBuilder builder, VCFHeader header) { private void addGenotypeData(StringBuilder builder, VCFHeader header) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
StringBuffer tempStr = new StringBuffer(); StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < getGenotypes().size() ) if ( header.getGenotypeSamples().size() < getGenotypes().size() ) {
throw new IllegalStateException("We have more genotype samples than the header specified"); for ( String sample : gMap.keySet() ) {
if ( !header.getGenotypeSamples().contains(sample) )
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
else
header.getGenotypeSamples().remove(sample);
}
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
}
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString); tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
String[] genotypeFormatStrings = mGenotypeFormatString.split(":"); String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
for ( String genotype : header.getGenotypeSamples() ) { for ( String genotype : header.getGenotypeSamples() ) {

View File

@ -7,7 +7,7 @@ import java.util.Arrays;
public class IndelRealignerIntegrationTest extends WalkerTest { public class IndelRealignerIntegrationTest extends WalkerTest {
@Test @Test
public void testIntervals() { public void testRealigner() {
String[] md5lod5 = {"67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"}; String[] md5lod5 = {"67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"};
WalkerTestSpec spec1 = new WalkerTestSpec( WalkerTestSpec spec1 = new WalkerTestSpec(
@ -23,4 +23,4 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
Arrays.asList(md5lod200)); Arrays.asList(md5lod200));
executeTest("testLod200", spec2); executeTest("testLod200", spec2);
} }
} }