From c231547204f09fd5dfc6aa0fcd16fe1e07c5cce8 Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 26 Jan 2010 13:53:29 +0000 Subject: [PATCH] Refactoring and migration of new allele/variantcontext/genotype code into oneoffprojects. NOT FOR USE. PlinkRod commented out due to dependence on this new, rapidly changing interface. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2687 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/contexts/VariantContext.java | 315 ---- .../sting/gatk/refdata/Allele.java | 81 - .../sting/gatk/refdata/PlinkRod.java | 1342 ++++++++--------- .../sting/gatk/walkers/PileupWalker.java | 8 +- .../oneoffprojects/variantcontext/Allele.java | 126 ++ .../variantcontext}/Genotype.java | 37 +- .../variantcontext/VariantContext.java | 440 ++++++ .../variantcontext/VariantContextUtils.java | 104 ++ .../sting/gatk/refdata/PlinkRodTest.java | 486 +++--- 9 files changed, 1604 insertions(+), 1335 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/contexts/VariantContext.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/Allele.java create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java rename java/src/org/broadinstitute/sting/{gatk/refdata => oneoffprojects/variantcontext}/Genotype.java (82%) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/VariantContext.java deleted file mode 100755 index 4f832b8ae..000000000 --- a/java/src/org/broadinstitute/sting/gatk/contexts/VariantContext.java +++ /dev/null @@ -1,315 +0,0 @@ -package org.broadinstitute.sting.gatk.contexts; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.gatk.refdata.*; - -import java.util.*; -import org.apache.commons.jexl.*; - - -/** - * @author ebanks - *

- * Class VariantContext - *

- * This class represents a context that unifies one or more variants - */ -public class VariantContext { - - private static final String UNIQUIFIED_SUFFIX = ".unique"; - - private Set alleles; - - private Set genotypes; - - private Allele reference; - - private GenomeLoc loc; - - private HashMap attributes; - - - public VariantContext(VariationRod rod) { - - // TODO -- VariationRod should eventually require classes to implement toVariationContext() - // TODO -- (instead of using a temporary adapter class) - - loc = rod.getLocation(); - reference = new Allele(Allele.AlleleType.REFERENCE, rod.getReference()); - - // TODO -- populate the alleles and genotypes through an adapter - alleles = new HashSet(); - genotypes = new HashSet(); - - attributes = new HashMap(); - } - - protected VariantContext(GenomeLoc loc, Allele reference, Set alleles, Set genotypes, HashMap attributes) { - this.loc = loc; - this.reference = reference; - this.alleles = new HashSet(alleles); - this.genotypes = new HashSet(genotypes); - this.attributes = new HashMap(attributes); - } - - /** - * @param other another variant context - * - * throws an exception if there is a collision such that the same sample exists in both contexts - * @return a context representing the merge of this context and the other provided context - */ - public VariantContext merge(VariantContext other) { - return merge(other, false); - } - - /** - * @param other another variant context - * @param uniquifySamples if true and there is a collision such that the same sample exists in both contexts, - * the samples will be uniquified(based on their sources); - * otherwise, an exception will be thrown - * - * @return a context representing the merge of this context and the other provided context - */ - public VariantContext merge(VariantContext other, boolean uniquifySamples) { - if ( !loc.equals(other.getLocation()) ) - throw new IllegalArgumentException("The locations must be identical for two contexts to be merged"); - - Set samples = getSampleNames(); - Set Gs = new HashSet(genotypes); - - for ( Genotype g : other.getGenotypes() ) { - if ( samples.contains(g.getSample()) ) { - if ( uniquifySamples ) - g.setSample(g.getSample() + UNIQUIFIED_SUFFIX); - else - throw new IllegalStateException("The same sample name exists in both contexts when attempting to merge"); - } - Gs.add(g); - } - - HashMap attrs = new HashMap(attributes); - attrs.putAll(other.getAttributes()); - - return createNewContext(Gs, attrs); - } - - /** - * @return the location of this context - */ - public GenomeLoc getLocation() { return loc; } - - /** - * @return the reference allele for this context - */ - public Allele getReference() { return reference; } - - /** - * @return true if the context is variant (i.e. contains a non-reference allele) - */ - public boolean isVariant() { - for ( Allele allele : alleles ) { - if ( allele.isVariant() ) - return true; - } - return false; - } - - /** - * @return true if the context is strictly bi-allelic - */ - public boolean isBiallelic() { - return getAlternateAlleles().size() == 1; - } - - /** - * @return true if the context represents point alleles only (i.e. no indels or structural variants) - */ - public boolean isPointAllele() { - for ( Allele allele : alleles ) { - if ( allele.isVariant() && !allele.isSNP() ) - return false; - } - return true; - } - - /** - * @return the set of all sample names in this context - */ - public Set getSampleNames() { - Set samples = new TreeSet(); - for ( Genotype g : genotypes ) - samples.add(g.getSample()); - return samples; - } - - /** - * @return true if the context represents variants with associated genotypes - */ - public boolean hasGenotypes() { return genotypes.size() > 0; } - - /** - * @return set of all Genotypes associated with this context - */ - public Set getGenotypes() { return genotypes; } - - /** - * @param sample the sample name - * - * @return the Genotype associated with the given sample in this context or null if the sample is not in this context - */ - public Genotype getGenotype(String sample) { - for ( Genotype g : genotypes ) { - if ( g.getSample().equals(sample) ) - return g; - } - return null; - } - - /** - * @return set of all subclasses within this context - */ - public Set getSubclasses() { - Set subclasses = new HashSet(); - for ( Genotype g : genotypes ) - subclasses.addAll(g.getAttributes().keySet()); - return subclasses; - } - - /** - * @param subclass the name of a subclass of variants to select - * - * @return a subset of this context which selects based on the given subclass - */ - public VariantContext select(String subclass) { - HashSet Gs = new HashSet(); - for ( Genotype g : genotypes ) { - if ( g.getAttribute(subclass) != null ) - Gs.add(g); - } - return createNewContext(Gs, attributes); - } - - /** - * @param expr a jexl expression describing how to filter this context - * - * @return a subset of this context which is filtered based on the given expression - */ - public VariantContext filter(String expr) { - HashSet Gs = new HashSet(); - try { - Expression filterExpression = ExpressionFactory.createExpression(expr); - - for ( Genotype g : genotypes ) { - JexlContext jContext = JexlHelper.createContext(); - jContext.setVars(g.getAttributes()); - if ( (Boolean)filterExpression.evaluate(jContext) ) - Gs.add(g); - } - - } catch (Exception e) { - throw new StingException("JEXL error in VariantContext: " + e.getMessage()); - } - - return createNewContext(Gs, attributes); - } - - /** - * @return a set of new variant contexts, one for each sample from this context - */ - public Set splitBySample() { - Set contexts = new HashSet(); - for ( Genotype g : genotypes ) { - HashSet gAsSet = new HashSet(); - gAsSet.add(g); - contexts.add(createNewContext(gAsSet, attributes)); - } - return contexts; - } - - /** - * @param Gs the set of genotypes from which to create a new context - * @param attrs the attributes for the new context - * - * @return a new context based on the given genotypes - */ - private VariantContext createNewContext(Set Gs, HashMap attrs) { - HashSet As = new HashSet(); - for ( Genotype g : Gs ) - As.addAll(g.getAlleles()); - return new VariantContext(loc, reference, As, Gs, attrs); - } - - /** - * @param allele the allele to be queried - * - * @return the frequency of the given allele in this context - */ - public double getAlleleFrequency(Allele allele) { - - int alleleCount = 0; - int totalCount = 0; - - for ( Genotype g : genotypes ) { - for ( Allele a : g.getAlleles() ) { - totalCount++; - if ( allele.equals(a) ) - alleleCount++; - } - } - - return totalCount == 0 ? 0.0 : (double)alleleCount / (double)totalCount; - } - - /** - * Gets the alleles. This method should return all of the alleles present at the location, - * including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. If the reference is not an allele in this context it will not be included. - * - * @return the set of alleles - */ - public Set getAlleles() { return alleles; } - - /** - * Gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. - * - * @return the set of alternate alleles - */ - public Set getAlternateAlleles() { - HashSet altAlleles = new HashSet(); - for ( Allele allele : alleles ) { - if ( !allele.equals(reference) ) - altAlleles.add(allele); - } - return altAlleles; - } - - /** - * Sets the given attribute - * - * @param key the attribute key - * @param value the attribute value - */ - public void setAttribute(Object key, Object value) { - attributes.put(key, value); - } - - /** - * @param key the attribute key - * - * @return the attribute value for the given key (or null if not set) - */ - public Object getAttribute(Object key) { - return attributes.get(key); - } - - /** - * @return the attribute map - */ - public Map getAttributes() { - return attributes; - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Allele.java b/java/src/org/broadinstitute/sting/gatk/refdata/Allele.java deleted file mode 100755 index ebdc4382d..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/Allele.java +++ /dev/null @@ -1,81 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - - -/** - * @author ebanks - *

- * Class Allele - *

- * This class emcompasses all the basic information about an allele - */ -public class Allele { - - private AlleleType type; - - private String bases; - - - // the types of variants we currently allow - public enum AlleleType { - REFERENCE, SNP, INSERTION, DELETION, INVERSION, UNKNOWN_POINT_ALLELE, DELETION_REFERENCE - } - - public Allele(AlleleType type, String bases) { - this.type = type; - if ( bases == null ) - throw new IllegalArgumentException("Constructor: the Allele base string cannot be null"); - if ( type == AlleleType.DELETION && bases.length() > 0 ) - throw new IllegalArgumentException("Constructor: deletions cannot have observed bases"); - if ( (type == AlleleType.REFERENCE || type == AlleleType.SNP || type == AlleleType.UNKNOWN_POINT_ALLELE) && bases.length() > 1 ) - throw new IllegalArgumentException("Constructor: point alleles cannot have more than one observed base"); - this.bases = bases.toUpperCase(); - } - - /** - * convenience method for switching over the allele type - * - * @return the AlleleType of this allele - **/ - public AlleleType getType() { return type; } - - /** - * convenience method for SNPs - * - * @return true if this is a SNP, false otherwise - */ - public boolean isSNP() { return type == AlleleType.SNP; } - - /** - * convenience method for variants - * - * @return true if this is a variant allele, false if it's reference - */ - public boolean isVariant() { return type != AlleleType.REFERENCE; } - - - /** - * convenience method for indels - * - * @return true if this is an indel, false otherwise - */ - public boolean isIndel() { return type == AlleleType.INSERTION || type == AlleleType.DELETION; } - - - /** - * For deletions, this method returns an empty String. - * For everything else, observed bases for the allele are returned. - * - * @return the bases, as a string - */ - public String getBases() { return bases; } - - /** - * @param other the other allele - * - * @return true if these alleles are equal - */ - public boolean equals(Allele other) { - return type == other.getType() && bases.equals(other.getBases()); - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java index 589f02a1c..506087adf 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java @@ -1,671 +1,671 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.CommandLineGATK; - -import java.io.*; -import java.util.*; - -import net.sf.samtools.SAMFileHeader; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Jan 19, 2010 - * Time: 10:24:18 AM - * To change this template use File | Settings | File Templates. - */ -public class PlinkRod extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { - private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", - "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); - private final byte SNP_MAJOR_MODE = 1; - - private ArrayList variants; - private PlinkVariantInfo currentVariant; - private ListIterator variantIterator; - - private PlinkFileType plinkFileType; - - public enum PlinkFileType { - STANDARD_PED,RAW_PED,BINARY_PED - } - - // // // CONSTRUCTOR // // // - - public PlinkRod(String name) { - super(name); - } - - @Override - public Object initialize(final File plinkFile) throws FileNotFoundException { - if ( ! plinkFile.exists() ) { - throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist."); - } - - variants = parsePlinkFile(plinkFile); - if ( variants != null ) { - variantIterator = variants.listIterator(); - currentVariant = variantIterator.next(); - } - - assertNotNull(); - - return null; - } - - private void assertNotNull() { - if ( currentVariant == null ) { - throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); - } - } - - @Override - public boolean parseLine(Object obj, String[] args) { - if ( variantIterator.hasNext() ) { - currentVariant = variantIterator.next(); - return true; - } else { - return false; - } - } - - @Override - public GenomeLoc getLocation() { - return currentVariant.getLocation(); - } - - @Override - public String toString() { - return currentVariant == null ? "" : currentVariant.toString(); - } - - public String getVariantName() { - return currentVariant.getName(); - } - - public ArrayList getVariantSampleNames() { - return currentVariant.getSampleNames(); - } - - public ArrayList getGenotypes() { - return currentVariant.getGenotypes(); - } - - public boolean variantIsSNP() { - return currentVariant.isSNP(); - } - - - // AM I PARSING A TEXT OR A BINARY FILE ?? - - private ArrayList parsePlinkFile(File file) { - String[] splitFileName = file.getName().split("\\."); - String extension = splitFileName[splitFileName.length-1]; - if ( extension.equals("ped") || extension.equals("raw") ) { - return parseTextFormattedPlinkFile(file); - } else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) { - plinkFileType = PlinkFileType.BINARY_PED; - return parseBinaryFormattedPlinkFile(file); - } else { - System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format"); - return parseTextFormattedPlinkFile(file); - } - - } - - /* *** *** *** *** *** ** *** ** *** ** *** ** *** ** *** - * * PARSING STANDARD TEXT PED FILES * ** - * *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/ - - private ArrayList parseTextFormattedPlinkFile( File file ) { - try { - BufferedReader reader = new BufferedReader( new FileReader ( file ) ); - String header = reader.readLine(); - ArrayList seqVars = instantiateVariantListFromHeader(header); - ArrayList snpOffsets = getSNPOffsetsFromHeader(header); - - String line = null; - do { - line = reader.readLine(); - incorporateInfo(seqVars,snpOffsets,line); - } while ( line != null ); - - - java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these - // are sorted in standard reference order - - return seqVars; - - } catch ( FileNotFoundException e ) { - throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); - } catch ( IOException e ) { - throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); - } - } - - private void incorporateInfo(List vars, List offsets, String plinkLine) { - if ( plinkLine == null ) { - return; - } - - String[] plinkInfo; - if ( plinkFileType == PlinkFileType.STANDARD_PED) { - plinkInfo = plinkLine.split("\t"); - } else { - throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - } - - String individualName = plinkInfo[1]; - - int snpNumber = 0; - - if ( plinkFileType == PlinkFileType.STANDARD_PED ) { - for ( int i : offsets ) { - vars.get(snpNumber).addGenotypeEntry(plinkInfo[i], individualName); - snpNumber++; - } - } - } - - private ArrayList instantiateVariantListFromHeader(String header) { - if ( header.startsWith("#") ) {// first line is a comment; what we're used to seeing - plinkFileType = PlinkFileType.STANDARD_PED; - } else {// first line is the raw header; comes from de-binary-ing a .bed file - plinkFileType = PlinkFileType.RAW_PED; - throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - } - - ArrayList seqVars = new ArrayList(); - String[] headerFields; - - if ( plinkFileType == PlinkFileType.STANDARD_PED ) { - headerFields = header.split("\t"); - } else { - throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - } - - for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - // not a standard header, so a variant - seqVars.add(new PlinkVariantInfo(field)); - } - } - - return seqVars; - } - - private ArrayList getSNPOffsetsFromHeader(String header) { - ArrayList offsets = new ArrayList(); - String[] headerFields; - if ( plinkFileType == PlinkFileType.STANDARD_PED ) { - headerFields = header.split("\t+"); - } else { - headerFields = header.split("\\s+"); - } - - int offset = 0; - - for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - offsets.add(offset); - } - offset++; - } - - return offsets; - } - - /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** - * * PARSING BINARY PLINK BED/BIM/FAM FILES * * - * *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/ - - private ArrayList parseBinaryFormattedPlinkFile(File file) { - PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); - ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); - ArrayList sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); - ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); - - java.util.Collections.sort(updatedVariants); - - return updatedVariants; - } - - private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) { - // just gonna parse the name - PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta(); - String absolute_path = file.getAbsolutePath(); - String[] directory_tree = absolute_path.split("/"); - String file_name = directory_tree[directory_tree.length-1].split("\\.")[0]; - StringBuilder pathBuilder = new StringBuilder(); - for ( int i = 0; i < directory_tree.length - 1; i ++ ) { - pathBuilder.append(String.format("%s/",directory_tree[i])); - } - String path = pathBuilder.toString(); - trifecta.bedFile = new File(path+file_name+".bed"); - trifecta.bimFile = new File(path+file_name+".bim"); - trifecta.famFile = new File(path+file_name+".fam"); - - return trifecta; - - } - - private ArrayList instantiateVariantsFromBimFile(File bimFile) { - BufferedReader reader; - try { - reader = new BufferedReader( new FileReader( bimFile )); - } catch ( FileNotFoundException e) { - throw new StingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+ - "Please ensure that it is in the same directory as the .bed and .fam files. The file we "+ - "Were looking for was "+bimFile.getAbsolutePath(),e); - } - - ArrayList variants = new ArrayList(); - - try { - String line = null; - do { - line = reader.readLine(); - if ( line != null ) { - String[] snpInfo = line.split("\\s+"); - PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true); - variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3]))); - variant.setAlleles(snpInfo[4],snpInfo[5]); - variants.add(variant); - } - } while ( line != null ); - } catch ( IOException e ) { - throw new StingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e); - } - - return variants; - } - - private ArrayList getSampleNameOrderingFromFamFile(File famFile) { - BufferedReader reader; - try { - reader = new BufferedReader( new FileReader( famFile )); - } catch ( FileNotFoundException e) { - throw new StingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+ - "Please ensure that it is in the same directory as the .bed and .bim files. The file we "+ - "Were looking for was "+famFile.getAbsolutePath(),e); - } - - ArrayList sampleNames = new ArrayList(); - - try { - String line = null; - do { - line = reader.readLine(); - if ( line != null ) { - sampleNames.add(line.split("\\s+")[1]); - } - } while ( line != null ); - } catch (IOException e) { - throw new StingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e); - } - - return sampleNames; - } - - private ArrayList getGenotypesFromBedFile(ArrayList variants, ArrayList samples, File bedFile) { - FileInputStream inStream; - try { - inStream = new FileInputStream(bedFile); - } catch (FileNotFoundException e) { - throw new StingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+ - "Please ensure that it is in the same directory as the .bim and .fam files. The file we "+ - "Were looking for was "+bedFile.getAbsolutePath(),e); - } - - try { - byte genotype = -1; - long bytesRead = 0; - int snpOffset = 0; - int sampleOffset = 0; - boolean snpMajorMode = true; - do { - genotype = (byte) inStream.read(); - bytesRead++; - if ( genotype != -1 ) { - if ( bytesRead > 3 ) { - addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); - - if ( snpMajorMode ) { - sampleOffset = sampleOffset + 4; - if ( sampleOffset > samples.size() -1 ) { - snpOffset ++; - sampleOffset = 0; - } - } else { - snpOffset = snpOffset + 4; - if ( snpOffset > variants.size() -1 ) { - sampleOffset ++; - snpOffset = 0; - } - } - - } else { - if ( bytesRead == 3) { - snpMajorMode = genotype == SNP_MAJOR_MODE; - } - } - } - } while ( genotype != -1 ); - } catch ( IOException e) { - throw new StingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e); - } - - return variants; - } - - private void addGenotypeByte(byte genotype, ArrayList variants, ArrayList sampleNames, int snpOffset, int sampleOffset, boolean major) { - // four genotypes encoded in this byte - int[] genotypes = parseGenotypes(genotype); - for ( int g : genotypes ) { - variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset)); - - if ( major ) { - sampleOffset++; - if ( sampleOffset > sampleNames.size()-1 ) { //using offsets for comparison; size 5 == offset 4 - return; - } - } else { - snpOffset++; - if( snpOffset > variants.size()-1 ) { - return; - } - } - } - } - - private int[] parseGenotypes(byte genotype) { - int[] genotypes = new int[4]; - genotypes[0] = ( genotype & 3 ); - genotypes[1] = ( ( genotype & 12 ) >>> 2 ); - genotypes[2] = ( ( genotype & 48 ) >>> 4 ); - genotypes[3] = ( ( genotype & 192 ) >>> 6 ); - return genotypes; - } -} - -class PlinkVariantInfo implements Comparable { - private String variantName; - private GenomeLoc loc; - private ArrayList genotypes = new ArrayList(); - private ArrayList sampleNames = new ArrayList(); - - private ArrayList indelHolder; - private ArrayList sampleHolder; - private int siteIndelLength; - private Allele.AlleleType indelType; - - // for binary parsing - - private String locAllele1; - private String locAllele2; - - public GenomeLoc getLocation() { - return loc; - } - - public String getName() { - return variantName; - } - - public ArrayList getSampleNames() { - return sampleNames; - } - - public ArrayList getGenotypes() { - return genotypes; - } - - public boolean isSNP() { - return this.indelType == null; - } - - public void setGenomeLoc(GenomeLoc loc) { - this.loc = loc; - } - - public void setAlleles(String al1, String al2) { - if ( al1.equals("0") ) { - // encoding for a site at which no variants were detected - locAllele1 = al2; - } else { - locAllele1 = al1; - } - locAllele2 = al2; - if ( ! isSNP() ) { - siteIndelLength = Math.max(locAllele1.length(),locAllele2.length()); - } - - } - - // CONSTRUCTOR - - public PlinkVariantInfo(String variantName) { - this.variantName = variantName; - parseName(); - } - - public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) { - this.variantName = variantName; - if ( onlyLookForIndelInfo ) { - parseNameForIndels(); - } else { - parseName(); - } - } - - private void parseName() { - String chrom = this.variantName.split("\\|c")[1].split("_")[0]; - String pos = this.variantName.split("_p")[1].split("_")[0]; - this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); - this.parseNameForIndels(); - } - - private void parseNameForIndels() { - if ( this.variantName.contains("_gI") || this.variantName.contains("_gD") ) { - this.instantiateIndel(); - } - } - - private void instantiateIndel() { - this.indelHolder = new ArrayList(); - this.sampleHolder = new ArrayList(); - this.siteIndelLength = -1; - this.indelType = this.variantName.contains("_gI") ? Allele.AlleleType.INSERTION : Allele.AlleleType.DELETION; - } - - public void addGenotypeEntry(String genotypeString, String sampleName) { - // identify if we're dealing with a deletion - if ( this.isSNP() ) { - this.addSNP(genotypeString.split("\\s+"),sampleName); - } else { - this.addIndel(genotypeString.split("\\s+"),sampleName); - } - } - - public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { - String[] alleleStr = new String[2]; - if ( genoTYPE == 0 ) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele1; - } else if (genoTYPE == 2) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele2; - } else if (genoTYPE == 3 ) { - alleleStr[0] = locAllele2; - alleleStr[1] = locAllele2; - } else { - alleleStr[0] = "0"; - alleleStr[1] = "0"; - } - - if ( this.isSNP() ) { - this.addSNP(alleleStr,sampleName); - } else { - this.addIndel(alleleStr,sampleName); - } - } - - private void addSNP(String[] alleleStrings, String sampleName) { - ArrayList alleles = new ArrayList(2); - - for ( String alStr : alleleStrings ) { - alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); - } - - genotypes.add(new Genotype(alleles,sampleName,20.0) ); - sampleNames.add(sampleName); - } - - private void addIndel(String[] alleleStrings, String sampleName) { - String alleleStr1 = alleleStrings[0]; - String alleleStr2 = alleleStrings[1]; - if ( alleleStr1.contains("-") ^ alleleStr2.contains("-") ) { - // heterozygous indel - if ( alleleStr1.contains("-") ) { - this.addHetIndel(alleleStr2,sampleName) ; - } else { - this.addHetIndel(alleleStr1,sampleName); - } - } else { - this.addHomIndel(alleleStr1, alleleStr2, sampleName); - } - } - - private void addHetIndel(String baseStr, String sampleName) { - Allele ref; - Allele alt; - - if ( indelType == Allele.AlleleType.INSERTION ) { - ref = new Allele(Allele.AlleleType.REFERENCE,""); - alt = new Allele(indelType,baseStr); - } else { - alt = new Allele(indelType,""); - ref = new Allele(Allele.AlleleType.DELETION_REFERENCE,baseStr); - } - - this.setIndelLength(alt,baseStr.length()); - - if ( ! indelHolder.isEmpty() ) { - siteIndelLength = baseStr.length(); - this.addHeldIndels(); - } - - Genotype indel = new Genotype(Arrays.asList(ref,alt), sampleName, 20.0); - this.setIndelGenotypeLength(indel,siteIndelLength); - this.genotypes.add(indel); - this.sampleNames.add(sampleName); - } - - private void addHomIndel(String strand1, String strand2, String sampleName) { - Allele allele1; - Allele allele2; - boolean reference; - if ( indelType == Allele.AlleleType.DELETION ) { - if ( strand1.contains("-") ) { - // homozygous deletion - allele1 = new Allele(indelType,""); - allele2 = new Allele(indelType,""); - reference = false; - } else { - allele1 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand1); - allele2 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand2); - reference = true; - } - } else { - if ( strand1.contains("-") ) { - // homozygous reference - allele1 = new Allele(Allele.AlleleType.REFERENCE,""); - allele2 = new Allele(Allele.AlleleType.REFERENCE,""); - reference = true; - } else { - allele1 = new Allele(indelType,strand1); - allele2 = new Allele(indelType,strand2); - reference = false; - } - } - - if ( reference ) { - if ( ! indelHolder.isEmpty() ) { - siteIndelLength = strand1.length(); - this.addHeldIndels(); - } - } - - if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site - Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0); - setIndelGenotypeLength(gen,siteIndelLength); - this.genotypes.add(gen); - this.sampleNames.add(sampleName); - } else { // hold on the variants until we *do* know the in/del length at this site - this.indelHolder.add(allele1); - this.indelHolder.add(allele2); - this.sampleHolder.add(sampleName); - } - - } - - private void setIndelGenotypeLength(Genotype g, int length) { - g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length); - } - - private void addHeldIndels() { - Allele del1; - Allele del2; - int startingSize = indelHolder.size(); - for ( int i = 0; i < startingSize ; i+=2 ) { - del1 = indelHolder.get(i); - del2 = indelHolder.get(i+1); - this.addIndelFromCache(del1,del2,sampleHolder.get(i/2)); - if ( indelHolder.size() != startingSize ) { - throw new StingException("Halting algorithm -- possible infinite loop"); - } - } - indelHolder.clear(); - sampleHolder.clear(); - } - - private void addIndelFromCache ( Allele indel1, Allele indel2, String sampleName ) { - this.setIndelLength(indel1,siteIndelLength); - this.setIndelLength(indel2,siteIndelLength); - Genotype indel = new Genotype(Arrays.asList(indel1,indel2),sampleName, 20.0); - this.setIndelGenotypeLength(indel,siteIndelLength); - this.genotypes.add(indel); - this.sampleNames.add(sampleName); - } - - public int compareTo(Object obj) { - if ( ! ( obj instanceof PlinkVariantInfo) ) { - return 1; - } - - return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); - } - - private void setIndelLength(Allele al, int length) { - // Todo -- once alleles support deletion lengths add that information - // Todo -- into the object; for now this can just return - return; - } -} - -class PlinkBinaryTrifecta { - - public PlinkBinaryTrifecta() { - - } - - public File bedFile; - public File bimFile; - public File famFile; - -} \ No newline at end of file +//package org.broadinstitute.sting.gatk.refdata; +// +//import org.broadinstitute.sting.utils.BaseUtils; +//import org.broadinstitute.sting.utils.GenomeLoc; +//import org.broadinstitute.sting.utils.GenomeLocParser; +//import org.broadinstitute.sting.utils.StingException; +//import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +//import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +//import org.broadinstitute.sting.gatk.CommandLineGATK; +// +//import java.io.*; +//import java.util.*; +// +//import net.sf.samtools.SAMFileHeader; +// +///** +// * Created by IntelliJ IDEA. +// * User: chartl +// * Date: Jan 19, 2010 +// * Time: 10:24:18 AM +// * To change this template use File | Settings | File Templates. +// */ +//public class PlinkRod extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { +// private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", +// "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); +// private final byte SNP_MAJOR_MODE = 1; +// +// private ArrayList variants; +// private PlinkVariantInfo currentVariant; +// private ListIterator variantIterator; +// +// private PlinkFileType plinkFileType; +// +// public enum PlinkFileType { +// STANDARD_PED,RAW_PED,BINARY_PED +// } +// +// // // // CONSTRUCTOR // // // +// +// public PlinkRod(String name) { +// super(name); +// } +// +// @Override +// public Object initialize(final File plinkFile) throws FileNotFoundException { +// if ( ! plinkFile.exists() ) { +// throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist."); +// } +// +// variants = parsePlinkFile(plinkFile); +// if ( variants != null ) { +// variantIterator = variants.listIterator(); +// currentVariant = variantIterator.next(); +// } +// +// assertNotNull(); +// +// return null; +// } +// +// private void assertNotNull() { +// if ( currentVariant == null ) { +// throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); +// } +// } +// +// @Override +// public boolean parseLine(Object obj, String[] args) { +// if ( variantIterator.hasNext() ) { +// currentVariant = variantIterator.next(); +// return true; +// } else { +// return false; +// } +// } +// +// @Override +// public GenomeLoc getLocation() { +// return currentVariant.getLocation(); +// } +// +// @Override +// public String toString() { +// return currentVariant == null ? "" : currentVariant.toString(); +// } +// +// public String getVariantName() { +// return currentVariant.getName(); +// } +// +// public ArrayList getVariantSampleNames() { +// return currentVariant.getSampleNames(); +// } +// +// public ArrayList getGenotypes() { +// return currentVariant.getGenotypes(); +// } +// +// public boolean variantIsSNP() { +// return currentVariant.isSNP(); +// } +// +// +// // AM I PARSING A TEXT OR A BINARY FILE ?? +// +// private ArrayList parsePlinkFile(File file) { +// String[] splitFileName = file.getName().split("\\."); +// String extension = splitFileName[splitFileName.length-1]; +// if ( extension.equals("ped") || extension.equals("raw") ) { +// return parseTextFormattedPlinkFile(file); +// } else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) { +// plinkFileType = PlinkFileType.BINARY_PED; +// return parseBinaryFormattedPlinkFile(file); +// } else { +// System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format"); +// return parseTextFormattedPlinkFile(file); +// } +// +// } +// +// /* *** *** *** *** *** ** *** ** *** ** *** ** *** ** *** +// * * PARSING STANDARD TEXT PED FILES * ** +// * *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/ +// +// private ArrayList parseTextFormattedPlinkFile( File file ) { +// try { +// BufferedReader reader = new BufferedReader( new FileReader ( file ) ); +// String header = reader.readLine(); +// ArrayList seqVars = instantiateVariantListFromHeader(header); +// ArrayList snpOffsets = getSNPOffsetsFromHeader(header); +// +// String line = null; +// do { +// line = reader.readLine(); +// incorporateInfo(seqVars,snpOffsets,line); +// } while ( line != null ); +// +// +// java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these +// // are sorted in standard reference order +// +// return seqVars; +// +// } catch ( FileNotFoundException e ) { +// throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); +// } catch ( IOException e ) { +// throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); +// } +// } +// +// private void incorporateInfo(List vars, List offsets, String plinkLine) { +// if ( plinkLine == null ) { +// return; +// } +// +// String[] plinkInfo; +// if ( plinkFileType == PlinkFileType.STANDARD_PED) { +// plinkInfo = plinkLine.split("\t"); +// } else { +// throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); +// } +// +// String individualName = plinkInfo[1]; +// +// int snpNumber = 0; +// +// if ( plinkFileType == PlinkFileType.STANDARD_PED ) { +// for ( int i : offsets ) { +// vars.get(snpNumber).addGenotypeEntry(plinkInfo[i], individualName); +// snpNumber++; +// } +// } +// } +// +// private ArrayList instantiateVariantListFromHeader(String header) { +// if ( header.startsWith("#") ) {// first line is a comment; what we're used to seeing +// plinkFileType = PlinkFileType.STANDARD_PED; +// } else {// first line is the raw header; comes from de-binary-ing a .bed file +// plinkFileType = PlinkFileType.RAW_PED; +// throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); +// } +// +// ArrayList seqVars = new ArrayList(); +// String[] headerFields; +// +// if ( plinkFileType == PlinkFileType.STANDARD_PED ) { +// headerFields = header.split("\t"); +// } else { +// throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); +// } +// +// for ( String field : headerFields ) { +// if ( ! headerEntries.contains(field) ) { +// // not a standard header, so a variant +// seqVars.add(new PlinkVariantInfo(field)); +// } +// } +// +// return seqVars; +// } +// +// private ArrayList getSNPOffsetsFromHeader(String header) { +// ArrayList offsets = new ArrayList(); +// String[] headerFields; +// if ( plinkFileType == PlinkFileType.STANDARD_PED ) { +// headerFields = header.split("\t+"); +// } else { +// headerFields = header.split("\\s+"); +// } +// +// int offset = 0; +// +// for ( String field : headerFields ) { +// if ( ! headerEntries.contains(field) ) { +// offsets.add(offset); +// } +// offset++; +// } +// +// return offsets; +// } +// +// /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** +// * * PARSING BINARY PLINK BED/BIM/FAM FILES * * +// * *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/ +// +// private ArrayList parseBinaryFormattedPlinkFile(File file) { +// PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); +// ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); +// ArrayList sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); +// ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); +// +// java.util.Collections.sort(updatedVariants); +// +// return updatedVariants; +// } +// +// private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) { +// // just gonna parse the name +// PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta(); +// String absolute_path = file.getAbsolutePath(); +// String[] directory_tree = absolute_path.split("/"); +// String file_name = directory_tree[directory_tree.length-1].split("\\.")[0]; +// StringBuilder pathBuilder = new StringBuilder(); +// for ( int i = 0; i < directory_tree.length - 1; i ++ ) { +// pathBuilder.append(String.format("%s/",directory_tree[i])); +// } +// String path = pathBuilder.toString(); +// trifecta.bedFile = new File(path+file_name+".bed"); +// trifecta.bimFile = new File(path+file_name+".bim"); +// trifecta.famFile = new File(path+file_name+".fam"); +// +// return trifecta; +// +// } +// +// private ArrayList instantiateVariantsFromBimFile(File bimFile) { +// BufferedReader reader; +// try { +// reader = new BufferedReader( new FileReader( bimFile )); +// } catch ( FileNotFoundException e) { +// throw new StingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+ +// "Please ensure that it is in the same directory as the .bed and .fam files. The file we "+ +// "Were looking for was "+bimFile.getAbsolutePath(),e); +// } +// +// ArrayList variants = new ArrayList(); +// +// try { +// String line = null; +// do { +// line = reader.readLine(); +// if ( line != null ) { +// String[] snpInfo = line.split("\\s+"); +// PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true); +// variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3]))); +// variant.setAlleles(snpInfo[4],snpInfo[5]); +// variants.add(variant); +// } +// } while ( line != null ); +// } catch ( IOException e ) { +// throw new StingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e); +// } +// +// return variants; +// } +// +// private ArrayList getSampleNameOrderingFromFamFile(File famFile) { +// BufferedReader reader; +// try { +// reader = new BufferedReader( new FileReader( famFile )); +// } catch ( FileNotFoundException e) { +// throw new StingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+ +// "Please ensure that it is in the same directory as the .bed and .bim files. The file we "+ +// "Were looking for was "+famFile.getAbsolutePath(),e); +// } +// +// ArrayList sampleNames = new ArrayList(); +// +// try { +// String line = null; +// do { +// line = reader.readLine(); +// if ( line != null ) { +// sampleNames.add(line.split("\\s+")[1]); +// } +// } while ( line != null ); +// } catch (IOException e) { +// throw new StingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e); +// } +// +// return sampleNames; +// } +// +// private ArrayList getGenotypesFromBedFile(ArrayList variants, ArrayList samples, File bedFile) { +// FileInputStream inStream; +// try { +// inStream = new FileInputStream(bedFile); +// } catch (FileNotFoundException e) { +// throw new StingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+ +// "Please ensure that it is in the same directory as the .bim and .fam files. The file we "+ +// "Were looking for was "+bedFile.getAbsolutePath(),e); +// } +// +// try { +// byte genotype = -1; +// long bytesRead = 0; +// int snpOffset = 0; +// int sampleOffset = 0; +// boolean snpMajorMode = true; +// do { +// genotype = (byte) inStream.read(); +// bytesRead++; +// if ( genotype != -1 ) { +// if ( bytesRead > 3 ) { +// addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); +// +// if ( snpMajorMode ) { +// sampleOffset = sampleOffset + 4; +// if ( sampleOffset > samples.size() -1 ) { +// snpOffset ++; +// sampleOffset = 0; +// } +// } else { +// snpOffset = snpOffset + 4; +// if ( snpOffset > variants.size() -1 ) { +// sampleOffset ++; +// snpOffset = 0; +// } +// } +// +// } else { +// if ( bytesRead == 3) { +// snpMajorMode = genotype == SNP_MAJOR_MODE; +// } +// } +// } +// } while ( genotype != -1 ); +// } catch ( IOException e) { +// throw new StingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e); +// } +// +// return variants; +// } +// +// private void addGenotypeByte(byte genotype, ArrayList variants, ArrayList sampleNames, int snpOffset, int sampleOffset, boolean major) { +// // four genotypes encoded in this byte +// int[] genotypes = parseGenotypes(genotype); +// for ( int g : genotypes ) { +// variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset)); +// +// if ( major ) { +// sampleOffset++; +// if ( sampleOffset > sampleNames.size()-1 ) { //using offsets for comparison; size 5 == offset 4 +// return; +// } +// } else { +// snpOffset++; +// if( snpOffset > variants.size()-1 ) { +// return; +// } +// } +// } +// } +// +// private int[] parseGenotypes(byte genotype) { +// int[] genotypes = new int[4]; +// genotypes[0] = ( genotype & 3 ); +// genotypes[1] = ( ( genotype & 12 ) >>> 2 ); +// genotypes[2] = ( ( genotype & 48 ) >>> 4 ); +// genotypes[3] = ( ( genotype & 192 ) >>> 6 ); +// return genotypes; +// } +//} +// +//class PlinkVariantInfo implements Comparable { +// private String variantName; +// private GenomeLoc loc; +// private ArrayList genotypes = new ArrayList(); +// private ArrayList sampleNames = new ArrayList(); +// +// private ArrayList indelHolder; +// private ArrayList sampleHolder; +// private int siteIndelLength; +// private Allele.AlleleType indelType; +// +// // for binary parsing +// +// private String locAllele1; +// private String locAllele2; +// +// public GenomeLoc getLocation() { +// return loc; +// } +// +// public String getName() { +// return variantName; +// } +// +// public ArrayList getSampleNames() { +// return sampleNames; +// } +// +// public ArrayList getGenotypes() { +// return genotypes; +// } +// +// public boolean isSNP() { +// return this.indelType == null; +// } +// +// public void setGenomeLoc(GenomeLoc loc) { +// this.loc = loc; +// } +// +// public void setAlleles(String al1, String al2) { +// if ( al1.equals("0") ) { +// // encoding for a site at which no variants were detected +// locAllele1 = al2; +// } else { +// locAllele1 = al1; +// } +// locAllele2 = al2; +// if ( ! isSNP() ) { +// siteIndelLength = Math.max(locAllele1.length(),locAllele2.length()); +// } +// +// } +// +// // CONSTRUCTOR +// +// public PlinkVariantInfo(String variantName) { +// this.variantName = variantName; +// parseName(); +// } +// +// public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) { +// this.variantName = variantName; +// if ( onlyLookForIndelInfo ) { +// parseNameForIndels(); +// } else { +// parseName(); +// } +// } +// +// private void parseName() { +// String chrom = this.variantName.split("\\|c")[1].split("_")[0]; +// String pos = this.variantName.split("_p")[1].split("_")[0]; +// this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); +// this.parseNameForIndels(); +// } +// +// private void parseNameForIndels() { +// if ( this.variantName.contains("_gI") || this.variantName.contains("_gD") ) { +// this.instantiateIndel(); +// } +// } +// +// private void instantiateIndel() { +// this.indelHolder = new ArrayList(); +// this.sampleHolder = new ArrayList(); +// this.siteIndelLength = -1; +// this.indelType = this.variantName.contains("_gI") ? Allele.AlleleType.INSERTION : Allele.AlleleType.DELETION; +// } +// +// public void addGenotypeEntry(String genotypeString, String sampleName) { +// // identify if we're dealing with a deletion +// if ( this.isSNP() ) { +// this.addSNP(genotypeString.split("\\s+"),sampleName); +// } else { +// this.addIndel(genotypeString.split("\\s+"),sampleName); +// } +// } +// +// public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { +// String[] alleleStr = new String[2]; +// if ( genoTYPE == 0 ) { +// alleleStr[0] = locAllele1; +// alleleStr[1] = locAllele1; +// } else if (genoTYPE == 2) { +// alleleStr[0] = locAllele1; +// alleleStr[1] = locAllele2; +// } else if (genoTYPE == 3 ) { +// alleleStr[0] = locAllele2; +// alleleStr[1] = locAllele2; +// } else { +// alleleStr[0] = "0"; +// alleleStr[1] = "0"; +// } +// +// if ( this.isSNP() ) { +// this.addSNP(alleleStr,sampleName); +// } else { +// this.addIndel(alleleStr,sampleName); +// } +// } +// +// private void addSNP(String[] alleleStrings, String sampleName) { +// ArrayList alleles = new ArrayList(2); +// +// for ( String alStr : alleleStrings ) { +// alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); +// } +// +// genotypes.add(new Genotype(alleles,sampleName,20.0) ); +// sampleNames.add(sampleName); +// } +// +// private void addIndel(String[] alleleStrings, String sampleName) { +// String alleleStr1 = alleleStrings[0]; +// String alleleStr2 = alleleStrings[1]; +// if ( alleleStr1.contains("-") ^ alleleStr2.contains("-") ) { +// // heterozygous indel +// if ( alleleStr1.contains("-") ) { +// this.addHetIndel(alleleStr2,sampleName) ; +// } else { +// this.addHetIndel(alleleStr1,sampleName); +// } +// } else { +// this.addHomIndel(alleleStr1, alleleStr2, sampleName); +// } +// } +// +// private void addHetIndel(String baseStr, String sampleName) { +// Allele ref; +// Allele alt; +// +// if ( indelType == Allele.AlleleType.INSERTION ) { +// ref = new Allele(Allele.AlleleType.REFERENCE,""); +// alt = new Allele(indelType,baseStr); +// } else { +// alt = new Allele(indelType,""); +// ref = new Allele(Allele.AlleleType.DELETION_REFERENCE,baseStr); +// } +// +// this.setIndelLength(alt,baseStr.length()); +// +// if ( ! indelHolder.isEmpty() ) { +// siteIndelLength = baseStr.length(); +// this.addHeldIndels(); +// } +// +// Genotype indel = new Genotype(Arrays.asList(ref,alt), sampleName, 20.0); +// this.setIndelGenotypeLength(indel,siteIndelLength); +// this.genotypes.add(indel); +// this.sampleNames.add(sampleName); +// } +// +// private void addHomIndel(String strand1, String strand2, String sampleName) { +// Allele allele1; +// Allele allele2; +// boolean reference; +// if ( indelType == Allele.AlleleType.DELETION ) { +// if ( strand1.contains("-") ) { +// // homozygous deletion +// allele1 = new Allele(indelType,""); +// allele2 = new Allele(indelType,""); +// reference = false; +// } else { +// allele1 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand1); +// allele2 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand2); +// reference = true; +// } +// } else { +// if ( strand1.contains("-") ) { +// // homozygous reference +// allele1 = new Allele(Allele.AlleleType.REFERENCE,""); +// allele2 = new Allele(Allele.AlleleType.REFERENCE,""); +// reference = true; +// } else { +// allele1 = new Allele(indelType,strand1); +// allele2 = new Allele(indelType,strand2); +// reference = false; +// } +// } +// +// if ( reference ) { +// if ( ! indelHolder.isEmpty() ) { +// siteIndelLength = strand1.length(); +// this.addHeldIndels(); +// } +// } +// +// if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site +// Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0); +// setIndelGenotypeLength(gen,siteIndelLength); +// this.genotypes.add(gen); +// this.sampleNames.add(sampleName); +// } else { // hold on the variants until we *do* know the in/del length at this site +// this.indelHolder.add(allele1); +// this.indelHolder.add(allele2); +// this.sampleHolder.add(sampleName); +// } +// +// } +// +// private void setIndelGenotypeLength(Genotype g, int length) { +// g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length); +// } +// +// private void addHeldIndels() { +// Allele del1; +// Allele del2; +// int startingSize = indelHolder.size(); +// for ( int i = 0; i < startingSize ; i+=2 ) { +// del1 = indelHolder.get(i); +// del2 = indelHolder.get(i+1); +// this.addIndelFromCache(del1,del2,sampleHolder.get(i/2)); +// if ( indelHolder.size() != startingSize ) { +// throw new StingException("Halting algorithm -- possible infinite loop"); +// } +// } +// indelHolder.clear(); +// sampleHolder.clear(); +// } +// +// private void addIndelFromCache ( Allele indel1, Allele indel2, String sampleName ) { +// this.setIndelLength(indel1,siteIndelLength); +// this.setIndelLength(indel2,siteIndelLength); +// Genotype indel = new Genotype(Arrays.asList(indel1,indel2),sampleName, 20.0); +// this.setIndelGenotypeLength(indel,siteIndelLength); +// this.genotypes.add(indel); +// this.sampleNames.add(sampleName); +// } +// +// public int compareTo(Object obj) { +// if ( ! ( obj instanceof PlinkVariantInfo) ) { +// return 1; +// } +// +// return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); +// } +// +// private void setIndelLength(Allele al, int length) { +// // Todo -- once alleles support deletion lengths add that information +// // Todo -- into the object; for now this can just return +// return; +// } +//} +// +//class PlinkBinaryTrifecta { +// +// public PlinkBinaryTrifecta() { +// +// } +// +// public File bedFile; +// public File bimFile; +// public File famFile; +// +//} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 07707525a..5d55a6625 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -60,11 +60,11 @@ public class PileupWalker extends LocusWalker implements TreeR @Argument(fullName="alwaysShowSecondBase",doc="If true, prints dummy bases for the second bases in the BAM file where they are missing",required=false) public boolean alwaysShowSecondBase = false; - @Argument(fullName="qualsAsInts",doc="If true, prints out qualities in the pileup as comma-separated integers",required=false) - public boolean qualsAsInts = false; + //@Argument(fullName="qualsAsInts",doc="If true, prints out qualities in the pileup as comma-separated integers",required=false) + //public boolean qualsAsInts = false; - @Argument(fullName="ignore_uncovered_bases",shortName="skip_uncov",doc="Output nothing when a base is uncovered") - public boolean IGNORE_UNCOVERED_BASES = false; + //@Argument(fullName="ignore_uncovered_bases",shortName="skip_uncov",doc="Output nothing when a base is uncovered") + //public boolean IGNORE_UNCOVERED_BASES = false; @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") public boolean SHOW_INDEL_PILEUPS = false; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java new file mode 100755 index 000000000..c2e093e46 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext; + +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.Arrays; + +/** + * @author ebanks, depristo + * Types of alleles: + * + * Ref: a t C g a // C is the reference base + * + * : a t G g a // C base is a G in some individuals + * + * : a t - g a // C base is deleted w.r.t. the reference + * + * : a t CAg a // A base is inserted w.r.t. the reference sequence + * + * In these cases, where are the alleles? + * + * SNP polymorphism of C/G -> { C , G } -> C is the reference allele + * 1 base deletion of C -> { C , - } -> C is the reference allele + * 1 base insertion of A -> { - ; A } -> NULL is the reference allele + * + * Suppose I see a the following in the population: + * + * Ref: a t C g a // C is the reference base + * : a t G g a // C base is a G in some individuals + * : a t - g a // C base is deleted w.r.t. the reference + * + * How do I represent this? There are three segregating alleles: + * + * { C , G , - } + * + * Now suppose I have this more complex example: + * + * Ref: a t C g a // C is the reference base + * : a t - g a + * : a t - - a + * : a t CAg a + * + * There are actually four segregating alleles: + * + * { C g , - g, - -, and CAg } over bases 2-4 + * + * However, the molecular equivalence explicitly listed above is usually discarded, so the actual + * segregating alleles are: + * + * { C g, g, -, C a g } + * + * Critically, it should be possible to apply an allele to a reference sequence to create the + * correct haplotype sequence: + * + * Allele + reference => haplotype + * + * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the + * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. + * + * Given list of alleles it's possible to determine the "type" of the variation + * + * A / C @ loc => SNP with + * - / A => INDEL + * + * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion + */ +public class Allele { + private boolean isRef = false; + private byte[] bases = null; + + public Allele(byte[] bases, boolean isRef) { + bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance + this.isRef = isRef; + + if ( bases == null ) + throw new IllegalArgumentException("Constructor: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); + + this.bases = bases; + for ( byte b : bases ) { + if ( ! BaseUtils.isRegularBase(b) ) { + throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); + } + } + } + + /** null allele creation method */ + public Allele(boolean isRef) { + this("", isRef); + } + + public Allele(String bases, boolean isRef) { + this(bases.getBytes(), isRef); + } + + // + // + // accessor routines + // + // + public boolean isNullAllele() { return length() == 0; } + public boolean isNonNullAllele() { return ! isNullAllele(); } + + public boolean isReference() { return isRef; } + public boolean isNonReference() { return ! isReference(); } + + + /** + * Return the DNA bases segregating in this allele. Note this isn't reference polarized, + * so the Null allele is represented by a vector of length 0 + * + * @return the segregating bases + */ + public byte[] getBases() { return bases; } + + /** + * @param other the other allele + * + * @return true if these alleles are equal + */ + public boolean equals(Allele other) { + return Arrays.equals(bases, other.getBases()); + } + + public int length() { + return bases.length; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Genotype.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java similarity index 82% rename from java/src/org/broadinstitute/sting/gatk/refdata/Genotype.java rename to java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java index cbdb96e3a..5bfd2dbdc 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/Genotype.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata; +package org.broadinstitute.sting.oneoffprojects.variantcontext; import java.util.*; @@ -10,11 +10,6 @@ import java.util.*; * This class emcompasses all the basic information about a genotype */ public class Genotype { - - public enum StandardAttributes { - DELETION_LENGTH, INVERSION_LENGTH - } - private List alleles; private double negLog10PError; @@ -32,9 +27,9 @@ public class Genotype { } /** - * @return the alleles for this genotype - */ - public List getAlleles() { return alleles; } + * @return the alleles for this genotype + */ + public List getAlleles() { return alleles; } /** * @return the ploidy of this genotype @@ -77,10 +72,10 @@ public class Genotype { * @return true if all alleles for this genotype are SNPs or reference */ public boolean isPointGenotype() { - for ( Allele allele : alleles ) { - if ( allele.isVariant() && !allele.isSNP() ) - return false; - } +// for ( Allele allele : alleles ) { +// if ( allele.isVariant() && !allele.isSNP() ) +// return false; +// } return true; } @@ -88,21 +83,21 @@ public class Genotype { * @return true if this is a variant genotype, false if it's reference */ public boolean isVariant() { - for ( Allele allele : alleles ) { - if ( allele.isVariant() ) - return true; - } +// for ( Allele allele : alleles ) { +// if ( allele.isVariant() ) +// return true; +// } return false; } /** - * @return the -1 * log10-based error estimate - */ + * @return the -1 * log10-based error estimate + */ public double getNegLog10PError() { return negLog10PError; } /** - * @return the sample name - */ + * @return the sample name + */ public String getSample() { return sample; } /** diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java new file mode 100755 index 000000000..c9e56742a --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java @@ -0,0 +1,440 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.gatk.refdata.*; + +import java.util.*; +import org.apache.commons.jexl.*; + + +/** + * @author ebanks + *

+ * Class VariantContext + *

+ * This class represents a context that unifies one or more variants + */ +public class VariantContext { + private GenomeLoc loc; + + private Set alleles = new HashSet(); + + private Set genotypes = new HashSet(); + + private HashMap attributes = new HashMap(); + + Type type = null; + + private double negLog10PError = 0.0; // todo - fixme + + /** Have we checked this VariantContext already? */ + private boolean validatedP = false; + +// public VariantContext(VariationRod rod) { +// +// // TODO -- VariationRod should eventually require classes to implement toVariationContext() +// // TODO -- (instead of using a temporary adapter class) +// +// loc = rod.getLocation(); +// reference = new Allele(Allele.AlleleType.REFERENCE, rod.getReference()); +// +// // TODO -- populate the alleles and genotypes through an adapter +// alleles = new HashSet(); +// genotypes = new HashSet(); +// +// attributes = new HashMap(); +// } + + // --------------------------------------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------------------------------------- + + public VariantContext(GenomeLoc loc) { + if ( loc == null ) { throw new StingException("GenomeLoc cannot be null"); } + + this.loc = loc; + } + + protected VariantContext(VariantContext parent, Set genotypes, HashMap attributes) { + this(parent.getLocation(), parent.getAlleles(), genotypes, attributes); + } + + public VariantContext(GenomeLoc loc, Set alleles, Set genotypes, HashMap attributes) { + this(loc); + + // todo -- add extensive testing here + + // todo -- check that exactly one allele is tagged as reference + + this.alleles = new HashSet(alleles); + this.genotypes = new HashSet(genotypes); + this.attributes = new HashMap(attributes); + } + + // --------------------------------------------------------------------------------------------------------- + // + // type operations + // + // --------------------------------------------------------------------------------------------------------- + + /** + * see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3 + * + * Format: + * dbSNP variation class + * Rules for assigning allele classes + * Sample allele definition + * + * Single Nucleotide Polymorphisms (SNPs)a + * Strictly defined as single base substitutions involving A, T, C, or G. + * A/T + * + * Deletion/Insertion Polymorphisms (DIPs) + * Designated using the full sequence of the insertion as one allele, and either a fully + * defined string for the variant allele or a Ò-Ó character to specify the deleted allele. + * This class will be assigned to a variation if the variation alleles are of different lengths or + * if one of the alleles is deleted (Ò-Ó). + * T/-/CCTA/G + * + * No-variation + * Reports may be submitted for segments of sequence that are assayed and determined to be invariant + * in the sample. + * (NoVariation) + * + * Mixed + * Mix of other classes + * + * + * Not currently supported: + * + * Heterozygous sequencea + * The term heterozygous is used to specify a region detected by certain methods that do not + * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking + * sequence must be provided to define a sequence context for the variation. + * (heterozygous) + * + * Microsatellite or short tandem repeat (STR) + * Alleles are designated by providing the repeat motif and the copy number for each allele. + * Expansion of the allele repeat motif designated in dbSNP into full-length sequence will + * be only an approximation of the true genomic sequence because many microsatellite markers are + * not fully sequenced and are resolved as size variants only. + * (CAC)8/9/10/11 + * + * Named variant + * Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon + * dimorphism for Alu or line elements. These variations frequently include a deletion Ò-Ó indicator + * for the absent allele. + * (alu) / - + * + * Multi-Nucleotide Polymorphism (MNP) + * Assigned to variations that are multi-base variations of a single, common length + * GGA/AGT + */ + + public enum Type { + NO_VARIATION, + SNP, + INDEL, + MIXED + } + + /** + * convenience method for switching over the allele type + * + * @return the AlleleType of this allele + **/ + public Type getType() { + if ( type == null ) + determineType(); + + return type; + } + + /** + * convenience method for SNPs + * + * @return true if this is a SNP, false otherwise + */ + public boolean isSNP() { return getType() == Type.SNP; } + + /** + * convenience method for variants + * + * @return true if this is a variant allele, false if it's reference + */ + public boolean isVariant() { return getType() != Type.NO_VARIATION; } + + + /** + * convenience method for indels + * + * @return true if this is an indel, false otherwise + */ + public boolean isIndel() { return getType() == Type.INDEL; } + + + /** + * convenience method for indels + * + * @return true if this is an mixed variation, false otherwise + */ + public boolean isMixed() { return getType() == Type.MIXED; } + + // --------------------------------------------------------------------------------------------------------- + // + // Generic accessors + // + // --------------------------------------------------------------------------------------------------------- + + /** + * @return the location of this context + */ + public GenomeLoc getLocation() { return loc; } + + + // --------------------------------------------------------------------------------------------------------- + // + // Working with alleles + // + // --------------------------------------------------------------------------------------------------------- + + /** + * @return the reference allele for this context + */ + public Allele getReference() { + for ( Allele allele : getAlleles() ) + if ( allele.isReference() ) + return allele; + + throw new StingException("BUG: no reference allele found at " + this); + } + + /** + * @return true if the context is strictly bi-allelic + */ + public boolean isBiallelic() { + //return getAlternateAlleles().size() == 1; + return getAlleles().size() == 2; + } + + /** + * Gets the alleles. This method should return all of the alleles present at the location, + * including the reference allele. There are no constraints imposed on the ordering of alleles + * in the set. If the reference is not an allele in this context it will not be included. + * + * @return the set of alleles + */ + public Set getAlleles() { return alleles; } + + /** + * Gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference allele. There are no constraints imposed on the ordering of alleles + * in the set. + * + * @return the set of alternate alleles + */ + public Set getAlternateAlleles() { + HashSet altAlleles = new HashSet(); + for ( Allele allele : alleles ) { + if ( allele.isNonReference() ) + altAlleles.add(allele); + } + + return altAlleles; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with genotypes + // + // --------------------------------------------------------------------------------------------------------- + + /** + * @return true if the context represents variants with associated genotypes + */ + public boolean hasGenotypes() { return genotypes.size() > 0; } + + /** + * @return set of all Genotypes associated with this context + */ + + // todo -- genotypes should really be stored as map, not set + public Set getGenotypes() { return genotypes; } + + public Map getGenotypeMap() { + HashMap map = new HashMap(); + for ( Genotype g : genotypes ) + map.put(g.getSample(), g); + return map; + } + + + /** + * @return the set of all sample names in this context + */ + public Set getSampleNames() { + return getGenotypeMap().keySet(); + } + + /** + * @param sample the sample name + * + * @return the Genotype associated with the given sample in this context or null if the sample is not in this context + */ + public Genotype getGenotype(String sample) { + return getGenotypeMap().get(sample); + } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with attributes + // + // --------------------------------------------------------------------------------------------------------- + + // todo -- refactor into AttributedObject and have VariantContext and Genotype inherit from them + + // todo -- define common attributes as enum + + /** + * Sets the given attribute + * + * @param key the attribute key + * @param value the attribute value + */ + public void putAttribute(Object key, Object value) { + attributes.put(key, value); + } + + public void putAttributes(Map map) { + attributes.putAll(map); + } + + public boolean hasAttribute(Object key) { + return attributes.containsKey(key); + } + + public int getNumAttributes() { + return attributes.size(); + } + + /** + * @param key the attribute key + * + * @return the attribute value for the given key (or null if not set) + */ + public Object getAttribute(Object key) { + return attributes.get(key); + } + + public Object getAttribute(Object key, Object defaultValue) { + if ( hasAttribute(key) ) + return attributes.get(key); + else + return defaultValue; + } + + public String getAttributeAsString(Object key) { return (String)getAttribute(key); } + public int getAttributeAsInt(Object key) { return (Integer)getAttribute(key); } + public double getAttributeAsDouble(Object key) { return (Double)getAttribute(key); } + + public String getAttributeAsString(Object key, String defaultValue) { return (String)getAttribute(key, defaultValue); } + public int getAttributeAsInt(Object key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } + public double getAttributeAsDouble(Object key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } + + /** + * @return the attribute map + */ + public Map getAttributes() { + return attributes; + } + + // --------------------------------------------------------------------------------------------------------- + // + // validation + // + // --------------------------------------------------------------------------------------------------------- + + /** + * To be called by any modifying routines + */ + private void invalidate() { validatedP = false; } + + public boolean validate() { + return validate(true); + } + + public boolean validate(boolean throwException) { + if ( ! validatedP ) { + boolean valid = false; + // todo -- add extensive validation checking here + if ( valid ) { + validatedP = valid; + } else if ( throwException ) { + throw new StingException(this + " failed validation"); + } + + return valid; + } else { + return validatedP; + } + } + + // --------------------------------------------------------------------------------------------------------- + // + // utility routines + // + // --------------------------------------------------------------------------------------------------------- + + private void determineType() { + if ( type == null ) { + // todo -- figure out the variation type + } + } + + // todo -- toString() method + + /** + * @return true if the context represents point alleles only (i.e. no indels or structural variants) + */ +// public boolean isPointAllele() { +// for ( Allele allele : alleles ) { +// if ( allele.isVariant() && !allele.isSNP() ) +// return false; +// } +// return true; +// } +// + +// /** +// * @return set of all subclasses within this context +// */ +// public Set getSubclasses() { +// Set subclasses = new HashSet(); +// for ( Genotype g : genotypes ) +// subclasses.addAll(g.getAttributes().keySet()); +// return subclasses; +// } + + /** + * @param allele the allele to be queried + * + * @return the frequency of the given allele in this context + */ + public double getAlleleFrequency(Allele allele) { + int alleleCount = 0; + int totalCount = 0; + + for ( Genotype g : genotypes ) { + for ( Allele a : g.getAlleles() ) { + totalCount++; + if ( allele.equals(a) ) + alleleCount++; + } + } + + return totalCount == 0 ? 0.0 : (double)alleleCount / (double)totalCount; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java new file mode 100755 index 000000000..f3842377b --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext; + +import java.util.*; +import org.apache.commons.jexl.*; + + +public class VariantContextUtils { + private static final String UNIQUIFIED_SUFFIX = ".unique"; + + /** + * @param other another variant context + * + * throws an exception if there is a collision such that the same sample exists in both contexts + * @return a context representing the merge of this context and the other provided context + */ + public VariantContext merge(VariantContext left, VariantContext other) { + return merge(left, other, false); + } + + /** + * @param other another variant context + * @param uniquifySamples if true and there is a collision such that the same sample exists in both contexts, + * the samples will be uniquified(based on their sources); + * otherwise, an exception will be thrown + * + * @return a context representing the merge of this context and the other provided context + */ + public VariantContext merge(VariantContext left, VariantContext other, boolean uniquifySamples) { + // todo -- make functional + + if ( !left.getLocation().equals(other.getLocation()) ) + throw new IllegalArgumentException("The locations must be identical for two contexts to be merged"); + + Set samples = left.getSampleNames(); + Set Gs = new HashSet(left.getGenotypes()); + + for ( Genotype g : other.getGenotypes() ) { + if ( samples.contains(g.getSample()) ) { + if ( uniquifySamples ) + g.setSample(g.getSample() + UNIQUIFIED_SUFFIX); + else + throw new IllegalStateException("The same sample name exists in both contexts when attempting to merge"); + } + Gs.add(g); + } + + HashMap attrs = new HashMap(left.getAttributes()); + attrs.putAll(other.getAttributes()); + + return new VariantContext(left, Gs, attrs); + } + + + /** + * @param subclass the name of a subclass of variants to select + * + * @return a subset of this context which selects based on the given subclass + */ +// public VariantContextUtils select(String subclass) { +// HashSet Gs = new HashSet(); +// for ( Genotype g : genotypes ) { +// if ( g.getAttribute(subclass) != null ) +// Gs.add(g); +// } +// return createNewContext(Gs, attributes); +// } + + /** + * @param expr a jexl expression describing how to filter this context + * + * @return a subset of this context which is filtered based on the given expression + */ +// public VariantContextUtils filter(String expr) { +// HashSet Gs = new HashSet(); +// try { +// Expression filterExpression = ExpressionFactory.createExpression(expr); +// +// for ( Genotype g : genotypes ) { +// JexlContext jContext = JexlHelper.createContext(); +// jContext.setVars(g.getAttributes()); +// if ( (Boolean)filterExpression.evaluate(jContext) ) +// Gs.add(g); +// } +// +// } catch (Exception e) { +// throw new StingException("JEXL error in VariantContext: " + e.getMessage()); +// } +// +// return createNewContext(Gs, attributes); +// } + + /** + * @return a set of new variant contexts, one for each sample from this context + */ +// public Set splitBySample() { +// Set contexts = new HashSet(); +// for ( Genotype g : genotypes ) { +// HashSet gAsSet = new HashSet(); +// gAsSet.add(g); +// contexts.add(createNewContext(gAsSet, attributes)); +// } +// return contexts; +// } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java index 6b85a228c..509abfbbc 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java @@ -1,243 +1,243 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.Assert; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.BufferedReader; -import java.io.FileReader; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: Jan 22, 2010 - * Time: 11:27:33 PM - * To change this template use File | Settings | File Templates. - */ -public class PlinkRodTest extends BaseTest { - private static IndexedFastaSequenceFile seq; - - @BeforeClass - public static void beforeTests() { - try { - seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } - GenomeLocParser.setupRefContigOrdering(seq); - - } - - public BufferedReader openFile(String filename) { - try { - return new BufferedReader(new FileReader(filename)); - } catch (FileNotFoundException e) { - throw new StingException("Couldn't open file " + filename); - } - - } - - @Test - public void testStandardPedFile() { - PlinkRod rod = new PlinkRod("test"); - try { - rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") ); - } catch ( FileNotFoundException e ) { - throw new StingException("test file for testStandardPedFile() does not exist",e); - } - - // test that the sample names are correct - - List rodNames = rod.getVariantSampleNames(); - List expectedNames = Arrays.asList("NA12887","NAMELY","COWBA"); - - Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size()); - - boolean namesCorrect = true; - for ( int i = 0; i < expectedNames.size(); i++ ) { - namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) ); - } - - Assert.assertTrue("That the names are correct and in the proper order",namesCorrect); - - // test that rod can be iterated over - - ArrayList> genotypesInRod = new ArrayList>(); - ArrayList> sampleNamesInRod = new ArrayList>(); - ArrayList lociInRod = new ArrayList(); - do { - genotypesInRod.add(rod.getGenotypes()); - sampleNamesInRod.add(rod.getVariantSampleNames()); - lociInRod.add(rod.getLocation()); - } while ( rod.parseLine(null,null) ); - - Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size()); - - ArrayList snp1 = genotypesInRod.get(0); - ArrayList snp3 = genotypesInRod.get(2); - - Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size()); - Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size()); - Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size()); - - - List snp1_individual3_alleles = snp1.get(2).getAlleles(); - List snp3_individual2_alleles = snp3.get(1).getAlleles(); - - String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases(); - String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases(); - - Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1); - Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2); - - boolean name2isSame = true; - - for ( ArrayList names : sampleNamesInRod ) { - name2isSame = name2isSame && names.get(1).equals("NAMELY"); - } - - Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame); - - // test that the loci are correctly parsed and in order - - List expectedLoci = Arrays.asList("1:123456","2:13274","3:11111"); - boolean lociCorrect = true; - for ( int i = 0; i < 3; i ++ ) { - lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i)); - } - } - - @Test - public void testStandardPedFileWithIndels() { - PlinkRod rod = new PlinkRod("test"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") ); - } catch ( FileNotFoundException e) { - throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e); - } - - // Iterate through the rod - - List> genotypesInRod = new ArrayList>(); - ArrayList> sampleNamesInRod = new ArrayList>(); - ArrayList lociInRod = new ArrayList(); - ArrayList snpSites = new ArrayList(); - do { - genotypesInRod.add(rod.getGenotypes()); - sampleNamesInRod.add(rod.getVariantSampleNames()); - lociInRod.add(rod.getLocation()); - snpSites.add(rod.variantIsSNP()); - } while ( rod.parseLine(null,null) ); - - boolean snpOrder = true; - List expectedOrder = Arrays.asList(true,false,true,false); - for ( int i = 0; i < 4; i ++ ) { - snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) ); - } - - Assert.assertTrue("That the variant type order is as expected", snpOrder); - Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1).isPointGenotype() ); - Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() ); - Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom()); - Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet()); - Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases()); - Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases()); - Assert.assertEquals("That the snp bases are correct","GC",genotypesInRod.get(2).get(2).getAlleles().get(0).getBases()+genotypesInRod.get(2).get(2).getAlleles().get(1).getBases()); - } - - @Test - public void testBinaryPedFileNoIndels() { - PlinkRod rod = new PlinkRod("binaryTest1"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed")); - } catch (FileNotFoundException e) { - throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); - } - - // iterate through the ROD and get stuff - ArrayList lociInRod = new ArrayList(); - ArrayList> genotypesInRod = new ArrayList>(); - ArrayList> samplesInRod = new ArrayList>(); - - do { - lociInRod.add(rod.getLocation()); - genotypesInRod.add(rod.getGenotypes()); - samplesInRod.add(rod.getVariantSampleNames()); - } while ( rod.parseLine(null,null) ); - - List expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713"); - - for ( int i = 0; i < expecLoc.size(); i ++ ) { - Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString()); - } - - List expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT", - "GG","GG","AG","TT","CC","CT","TG","GG","GG"); - List expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false, - false,false,false,true,false,false,true,true,false,false); - List expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", - "NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", - "NA12878","NA12890","NA07000"); - int snpNo = 1; - int indiv = 1; - int alleleOffset = 0; - for ( ArrayList snp : genotypesInRod ) { - for ( Genotype gen : snp ) { - String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases(); - Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+ - "(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr); - Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet()); - Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1)); - indiv++; - alleleOffset++; - } - indiv = 1; - snpNo++; - } - } - - @Test - public void testIndelBinary() { - PlinkRod rod = new PlinkRod("binaryTest2"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed")); - } catch (FileNotFoundException e) { - throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); - } - - ArrayList> genotypesInRod = new ArrayList>(); - do { - genotypesInRod.add(rod.getGenotypes()); - } while ( rod.parseLine(null,null) ); - - List expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT", - "TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG"); - List expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true, - false,false,false,false,false,false,true,false,true,false,false,false); - List expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false, - false,false,false,false,false,false,false,false,false,false,false,false); - - int al = 0; - for ( ArrayList snp : genotypesInRod ) { - for ( Genotype gen : snp ) { - String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases(); - Allele firstAl = gen.getAlleles().get(0); - Allele secondAl = gen.getAlleles().get(1); - boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION ); - boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION ); - Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr); - Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion); - Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion); - al++; - } - } - } -} +//package org.broadinstitute.sting.gatk.refdata; +// +//import org.broadinstitute.sting.BaseTest; +//import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +//import org.broadinstitute.sting.utils.StingException; +//import org.broadinstitute.sting.utils.GenomeLocParser; +//import org.broadinstitute.sting.utils.GenomeLoc; +//import org.junit.BeforeClass; +//import org.junit.Test; +//import org.junit.Assert; +// +//import java.io.File; +//import java.io.FileNotFoundException; +//import java.io.BufferedReader; +//import java.io.FileReader; +//import java.util.*; +// +///** +// * Created by IntelliJ IDEA. +// * User: Ghost +// * Date: Jan 22, 2010 +// * Time: 11:27:33 PM +// * To change this template use File | Settings | File Templates. +// */ +//public class PlinkRodTest extends BaseTest { +// private static IndexedFastaSequenceFile seq; +// +// @BeforeClass +// public static void beforeTests() { +// try { +// seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); +// } catch (FileNotFoundException e) { +// throw new StingException("unable to load the sequence dictionary"); +// } +// GenomeLocParser.setupRefContigOrdering(seq); +// +// } +// +// public BufferedReader openFile(String filename) { +// try { +// return new BufferedReader(new FileReader(filename)); +// } catch (FileNotFoundException e) { +// throw new StingException("Couldn't open file " + filename); +// } +// +// } +// +// @Test +// public void testStandardPedFile() { +// PlinkRod rod = new PlinkRod("test"); +// try { +// rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") ); +// } catch ( FileNotFoundException e ) { +// throw new StingException("test file for testStandardPedFile() does not exist",e); +// } +// +// // test that the sample names are correct +// +// List rodNames = rod.getVariantSampleNames(); +// List expectedNames = Arrays.asList("NA12887","NAMELY","COWBA"); +// +// Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size()); +// +// boolean namesCorrect = true; +// for ( int i = 0; i < expectedNames.size(); i++ ) { +// namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) ); +// } +// +// Assert.assertTrue("That the names are correct and in the proper order",namesCorrect); +// +// // test that rod can be iterated over +// +// ArrayList> genotypesInRod = new ArrayList>(); +// ArrayList> sampleNamesInRod = new ArrayList>(); +// ArrayList lociInRod = new ArrayList(); +// do { +// genotypesInRod.add(rod.getGenotypes()); +// sampleNamesInRod.add(rod.getVariantSampleNames()); +// lociInRod.add(rod.getLocation()); +// } while ( rod.parseLine(null,null) ); +// +// Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size()); +// +// ArrayList snp1 = genotypesInRod.get(0); +// ArrayList snp3 = genotypesInRod.get(2); +// +// Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size()); +// Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size()); +// Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size()); +// +// +// List snp1_individual3_alleles = snp1.get(2).getAlleles(); +// List snp3_individual2_alleles = snp3.get(1).getAlleles(); +// +// String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases(); +// String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases(); +// +// Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1); +// Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2); +// +// boolean name2isSame = true; +// +// for ( ArrayList names : sampleNamesInRod ) { +// name2isSame = name2isSame && names.get(1).equals("NAMELY"); +// } +// +// Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame); +// +// // test that the loci are correctly parsed and in order +// +// List expectedLoci = Arrays.asList("1:123456","2:13274","3:11111"); +// boolean lociCorrect = true; +// for ( int i = 0; i < 3; i ++ ) { +// lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i)); +// } +// } +// +// @Test +// public void testStandardPedFileWithIndels() { +// PlinkRod rod = new PlinkRod("test"); +// try { +// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") ); +// } catch ( FileNotFoundException e) { +// throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e); +// } +// +// // Iterate through the rod +// +// List> genotypesInRod = new ArrayList>(); +// ArrayList> sampleNamesInRod = new ArrayList>(); +// ArrayList lociInRod = new ArrayList(); +// ArrayList snpSites = new ArrayList(); +// do { +// genotypesInRod.add(rod.getGenotypes()); +// sampleNamesInRod.add(rod.getVariantSampleNames()); +// lociInRod.add(rod.getLocation()); +// snpSites.add(rod.variantIsSNP()); +// } while ( rod.parseLine(null,null) ); +// +// boolean snpOrder = true; +// List expectedOrder = Arrays.asList(true,false,true,false); +// for ( int i = 0; i < 4; i ++ ) { +// snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) ); +// } +// +// Assert.assertTrue("That the variant type order is as expected", snpOrder); +// Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1).isPointGenotype() ); +// Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() ); +// Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom()); +// Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet()); +// Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases()); +// Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases()); +// Assert.assertEquals("That the snp bases are correct","GC",genotypesInRod.get(2).get(2).getAlleles().get(0).getBases()+genotypesInRod.get(2).get(2).getAlleles().get(1).getBases()); +// } +// +// @Test +// public void testBinaryPedFileNoIndels() { +// PlinkRod rod = new PlinkRod("binaryTest1"); +// try { +// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed")); +// } catch (FileNotFoundException e) { +// throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); +// } +// +// // iterate through the ROD and get stuff +// ArrayList lociInRod = new ArrayList(); +// ArrayList> genotypesInRod = new ArrayList>(); +// ArrayList> samplesInRod = new ArrayList>(); +// +// do { +// lociInRod.add(rod.getLocation()); +// genotypesInRod.add(rod.getGenotypes()); +// samplesInRod.add(rod.getVariantSampleNames()); +// } while ( rod.parseLine(null,null) ); +// +// List expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713"); +// +// for ( int i = 0; i < expecLoc.size(); i ++ ) { +// Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString()); +// } +// +// List expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT", +// "GG","GG","AG","TT","CC","CT","TG","GG","GG"); +// List expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false, +// false,false,false,true,false,false,true,true,false,false); +// List expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", +// "NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", +// "NA12878","NA12890","NA07000"); +// int snpNo = 1; +// int indiv = 1; +// int alleleOffset = 0; +// for ( ArrayList snp : genotypesInRod ) { +// for ( Genotype gen : snp ) { +// String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases(); +// Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+ +// "(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr); +// Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet()); +// Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1)); +// indiv++; +// alleleOffset++; +// } +// indiv = 1; +// snpNo++; +// } +// } +// +// @Test +// public void testIndelBinary() { +// PlinkRod rod = new PlinkRod("binaryTest2"); +// try { +// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed")); +// } catch (FileNotFoundException e) { +// throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); +// } +// +// ArrayList> genotypesInRod = new ArrayList>(); +// do { +// genotypesInRod.add(rod.getGenotypes()); +// } while ( rod.parseLine(null,null) ); +// +// List expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT", +// "TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG"); +// List expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true, +// false,false,false,false,false,false,true,false,true,false,false,false); +// List expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false, +// false,false,false,false,false,false,false,false,false,false,false,false); +// +// int al = 0; +// for ( ArrayList snp : genotypesInRod ) { +// for ( Genotype gen : snp ) { +// String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases(); +// Allele firstAl = gen.getAlleles().get(0); +// Allele secondAl = gen.getAlleles().get(1); +// boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION ); +// boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION ); +// Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr); +// Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion); +// Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion); +// al++; +// } +// } +// } +//}