Various improvements to plink, variant context, and VCF code.
We almost completely support indels. Not yet done with plink stuff. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2926 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c8077b7a22
commit
0dd65461a1
|
|
@ -84,8 +84,10 @@ public class Allele implements Comparable<Allele> {
|
|||
|
||||
private byte[] bases = null;
|
||||
|
||||
public final static String NULL_ALLELE_STRING = "-";
|
||||
public final static String NO_CALL_STRING = ".";
|
||||
/** A generic static NO_CALL allele for use */
|
||||
public final static Allele NO_CALL = new Allele(".");
|
||||
public final static Allele NO_CALL = new Allele(NO_CALL_STRING);
|
||||
|
||||
/**
|
||||
* Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases
|
||||
|
|
|
|||
|
|
@ -596,6 +596,11 @@ public class VariantContext {
|
|||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @return the number of samples in the context
|
||||
*/
|
||||
public int getNSamples() { return genotypes.size(); }
|
||||
|
||||
/**
|
||||
* @return true if the context has associated genotypes
|
||||
*/
|
||||
|
|
@ -905,7 +910,7 @@ public class VariantContext {
|
|||
for ( Allele a : alleles ) {
|
||||
for ( Allele b : dest ) {
|
||||
if ( a.basesMatch(b) )
|
||||
throw new IllegalArgumentException("Duplicate allele added to VariantContext" + a);
|
||||
throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
|
||||
}
|
||||
|
||||
dest.add(a);
|
||||
|
|
@ -915,10 +920,10 @@ public class VariantContext {
|
|||
}
|
||||
|
||||
private static Map<String, Genotype> genotypeCollectionToMap(Map<String, Genotype> dest, Collection<Genotype> genotypes) {
|
||||
for ( Genotype a : genotypes ) {
|
||||
if ( dest.containsKey(a.getSampleName() ) )
|
||||
throw new IllegalArgumentException("Duplicate genotype added to VariantContext " + a);
|
||||
dest.put(a.getSampleName(), a);
|
||||
for ( Genotype g : genotypes ) {
|
||||
if ( dest.containsKey(g.getSampleName() ) )
|
||||
throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g);
|
||||
dest.put(g.getSampleName(), g);
|
||||
}
|
||||
|
||||
return dest;
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ import java.util.*;
|
|||
public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<PlinkRod> {
|
||||
|
||||
public static final String SEQUENOM_NO_CALL = "0";
|
||||
public static final String SEQUENOM_NO_VARIANT = "-";
|
||||
public static final String SEQUENOM_NO_BASE = "-";
|
||||
|
||||
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
|
||||
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
|
||||
|
|
@ -27,6 +27,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
|
||||
private PlinkFileType plinkFileType;
|
||||
|
||||
private List<String> sampleNames;
|
||||
|
||||
public enum PlinkFileType {
|
||||
STANDARD_PED, RAW_PED, BINARY_PED
|
||||
}
|
||||
|
|
@ -35,10 +37,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
super(name);
|
||||
}
|
||||
|
||||
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter) {
|
||||
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter, List<String> sampleNames) {
|
||||
super(name);
|
||||
currentVariant = record;
|
||||
variantIterator = iter;
|
||||
this.sampleNames = sampleNames;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -81,7 +84,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
|
||||
// get the next record
|
||||
currentVariant = variantIterator.next();
|
||||
return new PlinkRod(name, currentVariant, variantIterator);
|
||||
return new PlinkRod(name, currentVariant, variantIterator, sampleNames);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -111,9 +114,12 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
|
||||
}
|
||||
|
||||
public VariantContext getVariantContext() {
|
||||
assertNotNull();
|
||||
return currentVariant.getVariantContext();
|
||||
public Map<String, List<String>> getGenotypes() {
|
||||
return currentVariant.getGenotypes();
|
||||
}
|
||||
|
||||
public List<String> getSampleNames() {
|
||||
return sampleNames;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
|
|
@ -121,6 +127,16 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
return currentVariant.isIndel();
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
assertNotNull();
|
||||
return currentVariant.isInsertion();
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
assertNotNull();
|
||||
return currentVariant.getLength();
|
||||
}
|
||||
|
||||
|
||||
// AM I PARSING A TEXT OR A BINARY FILE ??
|
||||
|
||||
|
|
@ -150,6 +166,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header);
|
||||
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
|
||||
|
||||
sampleNames = new ArrayList<String>();
|
||||
|
||||
String line;
|
||||
do {
|
||||
line = reader.readLine();
|
||||
|
|
@ -180,6 +198,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
|
||||
plinkInfo = plinkLine.split("\t");
|
||||
String individualName = plinkInfo[1];
|
||||
sampleNames.add(individualName);
|
||||
|
||||
int snpNumber = 0;
|
||||
for ( int i : offsets ) {
|
||||
|
|
@ -237,8 +256,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) {
|
||||
PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file);
|
||||
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
|
||||
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
|
||||
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile);
|
||||
sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
|
||||
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile);
|
||||
|
||||
java.util.Collections.sort(updatedVariants);
|
||||
|
||||
|
|
@ -294,7 +313,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
return variants;
|
||||
}
|
||||
|
||||
private ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
|
||||
private static ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader( new FileReader( famFile ));
|
||||
|
|
@ -321,7 +340,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
return sampleNames;
|
||||
}
|
||||
|
||||
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, ArrayList<String> samples, File bedFile) {
|
||||
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, File bedFile) {
|
||||
FileInputStream inStream;
|
||||
try {
|
||||
inStream = new FileInputStream(bedFile);
|
||||
|
|
@ -342,11 +361,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
bytesRead++;
|
||||
if ( genotype != -1 ) {
|
||||
if ( bytesRead > 3 ) {
|
||||
addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode);
|
||||
addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode);
|
||||
|
||||
if ( snpMajorMode ) {
|
||||
sampleOffset = sampleOffset + 4;
|
||||
if ( sampleOffset > samples.size() -1 ) {
|
||||
if ( sampleOffset > sampleNames.size() -1 ) {
|
||||
snpOffset ++;
|
||||
sampleOffset = 0;
|
||||
}
|
||||
|
|
@ -372,7 +391,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
|||
return variants;
|
||||
}
|
||||
|
||||
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, ArrayList<String> sampleNames, int snpOffset, int sampleOffset, boolean major) {
|
||||
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, int snpOffset, int sampleOffset, boolean major) {
|
||||
// four genotypes encoded in this byte
|
||||
int[] genotypes = parseGenotypes(genotype);
|
||||
for ( int g : genotypes ) {
|
||||
|
|
@ -406,13 +425,12 @@ class PlinkVariantInfo implements Comparable {
|
|||
|
||||
private String variantName;
|
||||
private GenomeLoc loc;
|
||||
private HashSet<Allele> alleles = new HashSet<Allele>();
|
||||
private HashSet<Genotype> genotypes = new HashSet<Genotype>();
|
||||
private Map<String, List<String>> genotypes = new HashMap<String, List<String>>();
|
||||
|
||||
// for indels
|
||||
private boolean isIndel = false;
|
||||
private boolean isInsertion = false;
|
||||
private int indelLength = 0;
|
||||
private int length = 1;
|
||||
|
||||
// for binary parsing
|
||||
private String locAllele1;
|
||||
|
|
@ -432,18 +450,22 @@ class PlinkVariantInfo implements Comparable {
|
|||
return variantName;
|
||||
}
|
||||
|
||||
public VariantContext getVariantContext() {
|
||||
try {
|
||||
return new VariantContext(variantName, loc, alleles, genotypes);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
|
||||
}
|
||||
public Map<String, List<String>> getGenotypes() {
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
return isIndel;
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
return isInsertion;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public void setGenomeLoc(GenomeLoc loc) {
|
||||
this.loc = loc;
|
||||
}
|
||||
|
|
@ -463,10 +485,17 @@ class PlinkVariantInfo implements Comparable {
|
|||
String pos = pieces[1].substring(1);
|
||||
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
|
||||
|
||||
if ( pieces.length > 2 && (pieces[2].equals("gI") || pieces[2].equals("gD")) ) {
|
||||
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
|
||||
// it's an indel
|
||||
isIndel = true;
|
||||
isInsertion = pieces[2].equals("gI");
|
||||
isInsertion = pieces[2].startsWith("gI");
|
||||
try {
|
||||
// length of insertion on reference is still 1
|
||||
if ( !isInsertion )
|
||||
length = Integer.parseInt(pieces[2].substring(2));
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -482,32 +511,16 @@ class PlinkVariantInfo implements Comparable {
|
|||
|
||||
public void addGenotypeEntry(String[] alleleStrings, String sampleName) {
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
||||
ArrayList<String> alleles = new ArrayList<String>(2);
|
||||
|
||||
for ( String alleleString : alleleStrings ) {
|
||||
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) {
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
} else {
|
||||
Allele allele;
|
||||
if ( !isIndel ) {
|
||||
allele = new Allele(alleleString);
|
||||
} else if ( alleleString.equals(PlinkRod.SEQUENOM_NO_VARIANT) ) {
|
||||
allele = new Allele(alleleString, isInsertion);
|
||||
} else {
|
||||
allele = new Allele(alleleString, !isInsertion);
|
||||
if ( indelLength == 0 ) {
|
||||
indelLength = alleleString.length();
|
||||
loc = GenomeLocParser.parseGenomeLoc(loc.getContig(), loc.getStart(), loc.getStart()+indelLength-1);
|
||||
}
|
||||
}
|
||||
|
||||
myAlleles.add(allele);
|
||||
if ( alleles.size() < 2 )
|
||||
alleles.add(allele);
|
||||
}
|
||||
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) )
|
||||
alleles.add(Allele.NO_CALL_STRING);
|
||||
else
|
||||
alleles.add(alleleString);
|
||||
}
|
||||
|
||||
genotypes.add(new Genotype(sampleName, myAlleles, 20.0));
|
||||
genotypes.put(sampleName, alleles);
|
||||
}
|
||||
|
||||
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.refdata;
|
|||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
|
|
@ -36,6 +38,7 @@ public class VariantContextAdaptors {
|
|||
adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor());
|
||||
adaptors.put(RodVCF.class, new RodVCFAdaptor());
|
||||
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
|
||||
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
|
||||
}
|
||||
|
||||
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
|
||||
|
|
@ -46,6 +49,7 @@ public class VariantContextAdaptors {
|
|||
/** generic superclass */
|
||||
private static abstract class VCAdaptor {
|
||||
abstract VariantContext convert(String name, Object input);
|
||||
abstract VariantContext convert(String name, Object input, Allele refAllele);
|
||||
}
|
||||
|
||||
public static VariantContext toVariantContext(String name, Object variantContainingObject) {
|
||||
|
|
@ -56,9 +60,17 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
}
|
||||
|
||||
public static VariantContext toVariantContext(String name, Object variantContainingObject, Allele refAllele) {
|
||||
if ( ! adaptors.containsKey(variantContainingObject.getClass()) )
|
||||
return null;
|
||||
else {
|
||||
return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, refAllele);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// From here below you can add adaptor classes for new rods (or other types) to convert to VCF
|
||||
// From here below you can add adaptor classes for new rods (or other types) to convert to VC
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
|
@ -72,16 +84,17 @@ public class VariantContextAdaptors {
|
|||
|
||||
private static class RodDBSnpAdaptor extends VCAdaptor {
|
||||
VariantContext convert(String name, Object input) {
|
||||
if ( ! Allele.acceptableAlleleBases(((rodDbSNP)input).getReference()) )
|
||||
return null;
|
||||
Allele refAllele = new Allele(((rodDbSNP)input).getReference(), true);
|
||||
return convert(name, input, refAllele);
|
||||
}
|
||||
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)input;
|
||||
if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) {
|
||||
// add the reference allele
|
||||
if ( ! Allele.acceptableAlleleBases(dbsnp.getReference()) ) {
|
||||
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Allele refAllele = new Allele(dbsnp.getReference(), true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
// add all of the alt alleles
|
||||
|
|
@ -112,18 +125,28 @@ public class VariantContextAdaptors {
|
|||
|
||||
private static class RodVCFAdaptor extends VCAdaptor {
|
||||
VariantContext convert(String name, Object input) {
|
||||
return vcfToVariantContext(name, ((RodVCF)input).getRecord());
|
||||
Allele refAllele = new Allele(((RodVCF)input).getReference(), true);
|
||||
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
|
||||
}
|
||||
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
|
||||
}
|
||||
}
|
||||
|
||||
private static class VCFRecordAdaptor extends VCAdaptor {
|
||||
VariantContext convert(String name, Object input) {
|
||||
return vcfToVariantContext(name, (VCFRecord)input);
|
||||
Allele refAllele = new Allele(((VCFRecord)input).getReference(), true);
|
||||
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
|
||||
}
|
||||
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf) {
|
||||
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) {
|
||||
if ( vcf.isSNP() || vcf.isIndel() ) {
|
||||
// add the reference allele
|
||||
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
|
||||
|
|
@ -137,7 +160,6 @@ public class VariantContextAdaptors {
|
|||
|
||||
// add all of the alt alleles
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Allele refAllele = new Allele(vcf.getReference(), true);
|
||||
alleles.add(refAllele);
|
||||
for ( String alt : vcf.getAlternateAlleleList() ) {
|
||||
if ( ! Allele.acceptableAlleleBases(alt) ) {
|
||||
|
|
@ -194,9 +216,9 @@ public class VariantContextAdaptors {
|
|||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
||||
}
|
||||
|
||||
public static VCFRecord toVCF(VariantContext vc) {
|
||||
public static VCFRecord toVCF(VariantContext vc, char previousRefBase) {
|
||||
// deal with the reference
|
||||
String referenceBase = new String(vc.getReference().getBases());
|
||||
String referenceBases = new String(vc.getReference().getBases());
|
||||
|
||||
String contig = vc.getLocation().getContig();
|
||||
long position = vc.getLocation().getStart();
|
||||
|
|
@ -205,15 +227,51 @@ public class VariantContextAdaptors {
|
|||
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS;
|
||||
|
||||
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
|
||||
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
|
||||
// TODO - fixme: temporarily using Strings as keys because Alleles aren't hashing correctly
|
||||
Map<String, VCFGenotypeEncoding> alleleMap = new HashMap<String, VCFGenotypeEncoding>();
|
||||
alleleMap.put(Allele.NO_CALL.toString(), new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
|
||||
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
for ( Allele a : vc.getAlleles() ) {
|
||||
VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases()));
|
||||
|
||||
VCFGenotypeEncoding encoding;
|
||||
|
||||
// This is tricky because the VCF spec states that the reference must be a single
|
||||
// character, whereas the reference alleles for deletions of length > 1 are strings.
|
||||
// To be safe, we require the reference base be passed in and we use that whenever
|
||||
// we know that the given allele is the reference.
|
||||
|
||||
String alleleString = new String(a.getBases());
|
||||
if ( vc.getType() == VariantContext.Type.MIXED ) {
|
||||
throw new UnsupportedOperationException("Conversion from a mixed type isn't currently supported");
|
||||
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
|
||||
if ( a.isNull() ) {
|
||||
if ( a.isReference() ) {
|
||||
// ref, where alt is insertion
|
||||
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
|
||||
} else {
|
||||
// non-ref deletion
|
||||
encoding = new VCFGenotypeEncoding("D" + Integer.toString(referenceBases.length()));
|
||||
}
|
||||
} else if ( a.isReference() ) {
|
||||
// ref, where alt is deletion
|
||||
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
|
||||
} else {
|
||||
// non-ref insertion
|
||||
encoding = new VCFGenotypeEncoding("I" + alleleString);
|
||||
}
|
||||
} else if ( vc.getType() == VariantContext.Type.NO_VARIATION ) {
|
||||
// ref
|
||||
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
|
||||
} else {
|
||||
// ref or alt for snp
|
||||
encoding = new VCFGenotypeEncoding(alleleString);
|
||||
}
|
||||
|
||||
if ( a.isNonReference() ) {
|
||||
vcfAltAlleles.add(encoding);
|
||||
}
|
||||
alleleMap.put(a, encoding);
|
||||
|
||||
alleleMap.put(a.toString(), encoding);
|
||||
}
|
||||
|
||||
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY));
|
||||
|
|
@ -225,7 +283,7 @@ public class VariantContextAdaptors {
|
|||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
|
||||
for ( Allele a : g.getAlleles() ) {
|
||||
encodings.add(alleleMap.get(a));
|
||||
encodings.add(alleleMap.get(a.toString()));
|
||||
}
|
||||
|
||||
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
|
||||
|
|
@ -240,7 +298,6 @@ public class VariantContextAdaptors {
|
|||
|
||||
genotypeObjects.add(vcfG);
|
||||
}
|
||||
|
||||
// info fields
|
||||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
|
||||
|
|
@ -251,7 +308,7 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
}
|
||||
|
||||
return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
|
||||
return new VCFRecord(Character.toString(previousRefBase), contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
|
||||
}
|
||||
|
||||
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
|
||||
|
|
@ -266,4 +323,66 @@ public class VariantContextAdaptors {
|
|||
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
|
||||
return Utils.sorted(new ArrayList<String>(keys));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Plink to VariantContext
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private static class PlinkRodAdaptor extends VCAdaptor {
|
||||
VariantContext convert(String name, Object input) {
|
||||
throw new UnsupportedOperationException("Conversion from Plink to VC requires a reference allele");
|
||||
}
|
||||
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
PlinkRod plink = (PlinkRod)input;
|
||||
|
||||
HashMap<String, Allele> alleles = new HashMap<String, Allele>(); // use String keys to help maintain uniqueness
|
||||
alleles.put(refAllele.isNull() ? Allele.NULL_ALLELE_STRING : new String(refAllele.getBases()), refAllele);
|
||||
|
||||
Set<Genotype> genotypes = new HashSet<Genotype>();
|
||||
|
||||
Map<String, List<String>> genotypeSets = plink.getGenotypes();
|
||||
// for each sample
|
||||
for ( Map.Entry<String, List<String>> genotype : genotypeSets.entrySet() ) {
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
||||
|
||||
// for each allele
|
||||
for ( String alleleString : genotype.getValue() ) {
|
||||
Allele allele;
|
||||
if ( alleleString.equals(Allele.NO_CALL_STRING) ) {
|
||||
allele = Allele.NO_CALL;
|
||||
} else {
|
||||
if ( !plink.isIndel() ) {
|
||||
allele = new Allele(alleleString, refAllele.basesMatch(alleleString));
|
||||
} else if ( alleleString.equals(Allele.NULL_ALLELE_STRING) ) {
|
||||
allele = new Allele(alleleString, plink.isInsertion());
|
||||
} else {
|
||||
allele = new Allele(alleleString, !plink.isInsertion());
|
||||
}
|
||||
|
||||
if ( !alleles.containsKey(alleleString) )
|
||||
alleles.put(alleleString, allele);
|
||||
}
|
||||
|
||||
myAlleles.add(allele);
|
||||
}
|
||||
|
||||
// create the genotype
|
||||
genotypes.add(new Genotype(genotype.getKey(), myAlleles));
|
||||
}
|
||||
|
||||
// create the variant context
|
||||
try {
|
||||
GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1);
|
||||
VariantContext vc = new VariantContext(plink.getVariantName(), loc, alleles.values(), genotypes);
|
||||
vc.validate();
|
||||
return vc;
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,15 +5,10 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
|
|
@ -29,7 +24,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false)
|
||||
boolean onlyContextsStartinAtCurrentPosition = false;
|
||||
|
||||
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false)
|
||||
@Argument(fullName="printPerLocus", doc="If true, we'll psetenv LD_LIBRARY_PATH .:/util/gcc-4.3.0/lib64:/util/gcc-4.3.0/lib/gcc/x86_64-unknown-linux-gnu/4.3.0:/util/gcc-4.3.0/lib/:/util/lib:/broad/tools/Linux/x86_64/pkgs/boost_1.38.0/lib:/humgen/gsa-scr1/GATK_Data/bwarint the variant contexts, in addition to counts", required=false)
|
||||
boolean printContexts = false;
|
||||
|
||||
@Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false)
|
||||
|
|
@ -57,7 +52,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
|
|||
wroteHeader = true;
|
||||
}
|
||||
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc));
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
|
||||
}
|
||||
|
||||
n++;
|
||||
|
|
|
|||
|
|
@ -396,13 +396,13 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
if ( group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) )
|
||||
writeInterestingSite(interestingReasons, vc);
|
||||
writeInterestingSite(interestingReasons, vc, ref.getBase());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void writeInterestingSite(List<String> interestingReasons, VariantContext vc) {
|
||||
private void writeInterestingSite(List<String> interestingReasons, VariantContext vc, char ref) {
|
||||
if ( writer != null && interestingReasons.size() > 0 ) {
|
||||
MutableVariantContext mvc = new MutableVariantContext(vc);
|
||||
|
||||
|
|
@ -431,7 +431,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
|||
wroteHeader = true;
|
||||
}
|
||||
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(mvc));
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(mvc, ref));
|
||||
//interestingReasons.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,19 +10,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Test routine for new VariantContext object
|
||||
|
|
@ -68,6 +60,9 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
|
|||
}
|
||||
}
|
||||
|
||||
if ( vc != null )
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
|
|
@ -121,13 +116,6 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
|
|||
}
|
||||
|
||||
public Long reduce(VariantContext vc, Long prevReduce) {
|
||||
if ( vc == null ) {
|
||||
return prevReduce;
|
||||
} else {
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc));
|
||||
prevReduce++;
|
||||
}
|
||||
|
||||
return prevReduce;
|
||||
return ( vc == null ? prevReduce : prevReduce+1);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.refdata.PlinkRod;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
|
|
@ -22,6 +24,7 @@ import java.util.*;
|
|||
/**
|
||||
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
|
||||
*/
|
||||
@Reference(window=@Window(start=0,stop=40))
|
||||
public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
|
||||
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true)
|
||||
public File vcfFile = null;
|
||||
|
|
@ -39,27 +42,19 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
|
|||
@Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false)
|
||||
public double maxHomNonref = 1.1;
|
||||
|
||||
private final Set<String> HEADER_FIELDS = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype",
|
||||
"FID","IID","PAT","MAT","SEX","PHENOTYPE"));
|
||||
private static final int MAX_INDEL_SIZE = 40;
|
||||
|
||||
private final int INIT_NUMBER_OF_POPULATIONS = 10;
|
||||
private final int DEFAULT_QUALITY = 20;
|
||||
private ArrayList<String> sampleNames = new ArrayList<String>(nSamples);
|
||||
private VCFGenotypeWriterAdapter vcfWriter;
|
||||
private VCFWriter vcfWriter = null;
|
||||
private final HardyWeinberg HWCalc = new HardyWeinberg();
|
||||
private final boolean useSmartHardy = popFile != null;
|
||||
private HashMap<String,String> samplesToPopulation;
|
||||
|
||||
public void initialize() {
|
||||
vcfWriter = new VCFGenotypeWriterAdapter(vcfFile);
|
||||
if ( useSmartHardy ) {
|
||||
samplesToPopulation = parsePopulationFile(popFile);
|
||||
}
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
vcfWriter.writeHeader(new TreeSet<String>(sampleNames), hInfo);
|
||||
nSamples = sampleNames.size();
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -85,50 +80,69 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
|
|||
if ( plinkRod == null )
|
||||
return null;
|
||||
|
||||
if ( vcfWriter == null )
|
||||
initializeWriter(plinkRod);
|
||||
|
||||
return addVariantInformationToCall(ref, plinkRod);
|
||||
}
|
||||
|
||||
private void initializeWriter(PlinkRod plinkRod) {
|
||||
vcfWriter = new VCFWriter(vcfFile);
|
||||
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
VCFHeader header = new VCFHeader(hInfo, new TreeSet<String>(plinkRod.getSampleNames()));
|
||||
vcfWriter.writeHeader(header);
|
||||
|
||||
nSamples = sampleNames.size();
|
||||
|
||||
}
|
||||
|
||||
public Integer reduce(VCFRecord call, Integer numVariants) {
|
||||
if ( call != null ) {
|
||||
numVariants++;
|
||||
printToVCF(call);
|
||||
vcfWriter.addRecord(call);
|
||||
}
|
||||
return numVariants;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer finalReduce) {
|
||||
logger.info("Variants processed="+finalReduce.toString());
|
||||
vcfWriter.close();
|
||||
if ( vcfWriter != null )
|
||||
vcfWriter.close();
|
||||
}
|
||||
|
||||
private void printToVCF(VCFRecord call) {
|
||||
try {
|
||||
vcfWriter.addRecord(call);
|
||||
} catch ( RuntimeException e ) {
|
||||
if ( e.getLocalizedMessage().equalsIgnoreCase("We have more genotype samples than the header specified")) {
|
||||
throw new StingException("We have more sample genotypes than sample names -- check that there are no duplicates in the .ped file",e);
|
||||
} else {
|
||||
throw new StingException("Error in VCF creation: "+e.getLocalizedMessage(),e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) {
|
||||
|
||||
VariantContext vContext = plinkRod.getVariantContext();
|
||||
VCFRecord record = VariantContextAdaptors.toVCF(vContext);
|
||||
// determine the reference allele
|
||||
Allele refAllele;
|
||||
if ( !plinkRod.isIndel() ) {
|
||||
refAllele = new Allele(Character.toString(ref.getBase()), true);
|
||||
} else if ( plinkRod.isInsertion() ) {
|
||||
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
|
||||
} else {
|
||||
if ( plinkRod.getLength() > MAX_INDEL_SIZE )
|
||||
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
|
||||
char[] deletion = new char[plinkRod.getLength()];
|
||||
System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength());
|
||||
refAllele = new Allele(new String(deletion), true);
|
||||
}
|
||||
|
||||
VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele);
|
||||
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
|
||||
record.setGenotypeFormatString("GT");
|
||||
|
||||
int numNoCalls = vContext.getNoCallCount();
|
||||
int numHomVarCalls = vContext.getHomVarCount();
|
||||
if ( true )
|
||||
return record;
|
||||
|
||||
int numHetCalls = vContext.getHetCount();
|
||||
|
||||
double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples();
|
||||
double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples();
|
||||
|
||||
|
||||
double noCallProp = ( (double) numNoCalls )/( (double) sampleNames.size());
|
||||
double homNonRProp = ( (double) numHomVarCalls )/( (double) sampleNames.size() - numNoCalls);
|
||||
|
||||
record.setQual(DEFAULT_QUALITY);
|
||||
String hw = hardyWeinbergCalculation(ref,record);
|
||||
double hwScore = hw != null ? Double.valueOf(hw) : -0.0;
|
||||
// TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore));
|
||||
|
|
@ -136,7 +150,7 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
|
|||
record.setFilterString("Hardy-Weinberg");
|
||||
} else if ( noCallProp > maxNoCall ) {
|
||||
record.setFilterString("No-calls");
|
||||
} else if ( homNonRProp > maxHomNonref) {
|
||||
} else if ( homVarProp > maxHomNonref) {
|
||||
record.setFilterString("HomNonref-calls");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -582,7 +582,10 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
builder.append(EMPTY_ALLELE_FIELD);
|
||||
}
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual()));
|
||||
if ( MathUtils.compareDoubles(mQual, -1.0) == 0 )
|
||||
builder.append("-1");
|
||||
else
|
||||
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
|
|
@ -623,12 +626,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
* @param header the header object
|
||||
*/
|
||||
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
|
||||
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
|
||||
|
||||
StringBuffer tempStr = new StringBuffer();
|
||||
if ( header.getGenotypeSamples().size() < getGenotypes().size() )
|
||||
throw new IllegalStateException("We have more genotype samples than the header specified");
|
||||
if ( header.getGenotypeSamples().size() < getGenotypes().size() ) {
|
||||
for ( String sample : gMap.keySet() ) {
|
||||
if ( !header.getGenotypeSamples().contains(sample) )
|
||||
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
|
||||
else
|
||||
header.getGenotypeSamples().remove(sample);
|
||||
}
|
||||
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
|
||||
}
|
||||
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
|
||||
|
||||
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
|
||||
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
|
||||
|
||||
for ( String genotype : header.getGenotypeSamples() ) {
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import java.util.Arrays;
|
|||
|
||||
public class IndelRealignerIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void testIntervals() {
|
||||
public void testRealigner() {
|
||||
|
||||
String[] md5lod5 = {"67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"};
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
|
|
@ -23,4 +23,4 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
|
|||
Arrays.asList(md5lod200));
|
||||
executeTest("testLod200", spec2);
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue