diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java new file mode 100644 index 000000000..7bc30c9b7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java @@ -0,0 +1,651 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 19, 2010 + * Time: 10:24:18 AM + * To change this template use File | Settings | File Templates. + */ +public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { + + private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", + "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); + private final byte SNP_MAJOR_MODE = 0x00000001; + + private ArrayList variants; + private PlinkVariantInfo currentVariant; + private ListIterator variantIterator; + + + private PlinkFileType plinkFileType; + + public enum PlinkFileType { + STANDARD_PED,RAW_PED,BINARY_PED + } + + // // // CONSTRUCTOR // // // + + public PlinkRodWithGenomeLoc(String name) { + super(name); + } + + @Override + public Object initialize(final File plinkFile) throws FileNotFoundException { + if ( ! plinkFile.exists() ) { + throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist."); + } + + variants = parsePlinkFile(plinkFile); + if ( variants != null ) { + variantIterator = variants.listIterator(); + currentVariant = variantIterator.next(); + } + + assertNotNull(); + + return null; + } + + private void assertNotNull() { + if ( currentVariant == null ) { + throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); + } + } + + @Override + public boolean parseLine(Object obj, String[] args) { + if ( variantIterator.hasNext() ) { + currentVariant = variantIterator.next(); + return true; + } else { + return false; + } + } + + @Override + public GenomeLoc getLocation() { + return currentVariant.getLocation(); + } + + @Override + public String toString() { + return currentVariant == null ? "" : currentVariant.toString(); + } + + public String getVariantName() { + return currentVariant.getName(); + } + + public ArrayList getVariantSampleNames() { + return currentVariant.getSampleNames(); + } + + public ArrayList getGenotypes() { + return currentVariant.getGenotypes(); + } + + // AM I PARSING A TEXT OR A BINARY FILE ?? + + private ArrayList parsePlinkFile(File file) { + String[] splitFileName = file.getName().split("."); + String extension = splitFileName[splitFileName.length-1]; + if ( extension.equals("ped") || extension.equals("raw") ) { + return parseTextFormattedPlinkFile(file); + } else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) { + plinkFileType = PlinkFileType.BINARY_PED; + return parseBinaryFormattedPlinkFile(file); + } else { + System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format"); + return parseTextFormattedPlinkFile(file); + } + + } + + /* *** *** *** *** *** ** *** ** *** ** *** ** *** ** *** + * * PARSING STANDARD TEXT PED FILES * ** + * *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/ + + private ArrayList parseTextFormattedPlinkFile( File file ) { + try { + BufferedReader reader = new BufferedReader( new FileReader ( file ) ); + String header = reader.readLine(); + ArrayList seqVars = instantiateVariantListFromHeader(header); + ArrayList snpOffsets = getSNPOffsetsFromHeader(header); + + String line = null; + do { + line = reader.readLine(); + incorporateInfo(seqVars,snpOffsets,line); + } while ( line != null ); + + java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these + // are sorted in standard reference order + + return seqVars; + + } catch ( FileNotFoundException e ) { + throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); + } catch ( IOException e ) { + throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); + } + } + + private void incorporateInfo(List vars, List offsets, String plinkLine) { + String[] plinkInfo; + if ( plinkFileType == PlinkFileType.STANDARD_PED) { + plinkInfo = plinkLine.split("\t"); + } else { + throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); + } + + String individualName = plinkInfo[1]; + + int snpNumber = 0; + + if ( plinkFileType == PlinkFileType.STANDARD_PED ) { + for ( int i : offsets ) { + vars.get(snpNumber).addGenotypeEntry(plinkInfo[i], individualName); + snpNumber++; + } + } + } + + private ArrayList instantiateVariantListFromHeader(String header) { + if ( header.startsWith("#") ) {// first line is a comment; what we're used to seeing + plinkFileType = PlinkFileType.STANDARD_PED; + } else {// first line is the raw header; comes from de-binary-ing a .bed file + plinkFileType = PlinkFileType.RAW_PED; + throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); + } + + ArrayList seqVars = new ArrayList(); + String[] headerFields; + + if ( plinkFileType == PlinkFileType.STANDARD_PED ) { + headerFields = header.split("\t"); + } else { + throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); + } + + for ( String field : headerFields ) { + if ( ! headerEntries.contains(field) ) { + // not a standard header, so a variant + seqVars.add(new PlinkVariantInfo(field)); + } + } + + return seqVars; + } + + private ArrayList getSNPOffsetsFromHeader(String header) { + ArrayList offsets = new ArrayList(); + String[] headerFields; + if ( plinkFileType == PlinkFileType.STANDARD_PED ) { + headerFields = header.split("\t"); + } else { + headerFields = header.split("\\s+"); + } + + int offset = 0; + + for ( String field : headerFields ) { + if ( ! headerEntries.contains(field) ) { + offsets.add(offset); + } + offset++; + } + + return offsets; + } + + /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** + * * PARSING BINARY PLINK BED/BIM/FAM FILES * * + * *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/ + + private ArrayList parseBinaryFormattedPlinkFile(File file) { + PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); + ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); + ArrayList sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); + ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); + return updatedVariants; + } + + private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) { + // just gonna parse the name + PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta(); + String absolute_path = file.getAbsolutePath(); + String[] directory_tree = absolute_path.split("/"); + String file_name = directory_tree[directory_tree.length-1].split(".")[0]; + StringBuilder pathBuilder = new StringBuilder(); + for ( String folder : directory_tree ) { + pathBuilder.append(String.format("%s/",folder)); + } + String path = pathBuilder.toString(); + trifecta.bedFile = new File(path+file_name+".bed"); + trifecta.bimFile = new File(path+file_name+".bim"); + trifecta.famFile = new File(path+file_name+".fam"); + + return trifecta; + + } + + private ArrayList instantiateVariantsFromBimFile(File bimFile) { + BufferedReader reader; + try { + reader = new BufferedReader( new FileReader( bimFile )); + } catch ( FileNotFoundException e) { + throw new StingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+ + "Please ensure that it is in the same directory as the .bed and .fam files. The file we "+ + "Were looking for was "+bimFile.getAbsolutePath(),e); + } + + ArrayList variants = new ArrayList(); + + try { + String line = null; + do { + line = reader.readLine(); + if ( line != null ) { + String[] snpInfo = line.split("\\s+"); + PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true); + variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[2]), Long.valueOf(snpInfo[2])+1)); + variant.setAlleles(snpInfo[4],snpInfo[5]); + variants.add(variant); + } + } while ( line != null ); + } catch ( IOException e ) { + throw new StingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e); + } + + return variants; + } + + private ArrayList getSampleNameOrderingFromFamFile(File famFile) { + BufferedReader reader; + try { + reader = new BufferedReader( new FileReader( famFile )); + } catch ( FileNotFoundException e) { + throw new StingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+ + "Please ensure that it is in the same directory as the .bed and .bim files. The file we "+ + "Were looking for was "+famFile.getAbsolutePath(),e); + } + + ArrayList sampleNames = new ArrayList(); + + try { + String line = null; + do { + line = reader.readLine(); + if ( line != null ) { + sampleNames.add(line.split("\\s+")[1]); + } + } while ( line != null ); + } catch (IOException e) { + throw new StingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e); + } + + return sampleNames; + } + + private ArrayList getGenotypesFromBedFile(ArrayList variants, ArrayList samples, File bedFile) { + FileInputStream inStream; + try { + inStream = new FileInputStream(bedFile); + } catch (FileNotFoundException e) { + throw new StingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+ + "Please ensure that it is in the same directory as the .bim and .fam files. The file we "+ + "Were looking for was "+bedFile.getAbsolutePath(),e); + } + + try { + byte genotype = -1; + long bytesRead = 0; + int snpOffset = 0; + int sampleOffset = 0; + boolean snpMajorMode = true; + do { + genotype = (byte) inStream.read(); + bytesRead++; + if ( genotype != -1 ) { + if ( bytesRead > 3 ) { + addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); + + if ( snpMajorMode ) { + sampleOffset = sampleOffset + 4; + while ( sampleOffset > samples.size() ) { + snpOffset ++; + sampleOffset = sampleOffset % samples.size(); + } + } else { + snpOffset = snpOffset + 4; + while ( snpOffset > variants.size() ) { + sampleOffset ++; + snpOffset = snpOffset % samples.size(); + } + } + + } else { + if ( bytesRead == 2) { + snpMajorMode = genotype == SNP_MAJOR_MODE; + } + } + } + } while ( genotype != -1 ); + } catch ( IOException e) { + throw new StingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e); + } + + return variants; + } + + private void addGenotypeByte(byte genotype, ArrayList variants, ArrayList sampleNames, int snpOffset, int sampleOffset, boolean major) { + // four genotypes encoded in this byte + int[] genotypes = parseGenotypes(genotype); + for ( int g : genotypes ) { + variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset)); + + if ( major ) { + sampleOffset++; + while ( sampleOffset > sampleNames.size() ) { + snpOffset++; + sampleOffset = sampleOffset % sampleNames.size(); + } + if ( snpOffset >= variants.size() ) { + // done with file; early return + return; + } + } else { + snpOffset++; + while( snpOffset > variants.size() ) { + sampleOffset++; + snpOffset = snpOffset % variants.size(); + } + if ( sampleOffset >= sampleNames.size() ) { + // done with file; early return + return; + } + } + } + } + + private int[] parseGenotypes(byte genotype) { + int[] genotypes = new int[4]; + genotypes[0] = ( genotype & 0x00000011 ); + genotypes[1] = ( ( genotype & 0x00001100 ) >>> 2 ); + genotypes[2] = ( ( genotype & 0x00110000 ) >>> 4 ); + genotypes[3] = ( ( genotype & 0x11000000 ) >>> 6 ); + return genotypes; + } +} + +class PlinkVariantInfo implements Comparable { + private String variantName; + private GenomeLoc loc; + private ArrayList genotypes = new ArrayList(); + private ArrayList sampleNames = new ArrayList(); + + private ArrayList indelHolder; + private ArrayList sampleHolder; + private int siteIndelLength; + private Allele.AlleleType indelType; + + // for binary parsing + + private String locAllele1; + private String locAllele2; + + public GenomeLoc getLocation() { + return loc; + } + + public String getName() { + return variantName; + } + + public ArrayList getSampleNames() { + return sampleNames; + } + + public ArrayList getGenotypes() { + return genotypes; + } + + private boolean isSNP() { + return this.indelType == null; + } + + public void setGenomeLoc(GenomeLoc loc) { + this.loc = loc; + } + + public void setAlleles(String al1, String al2) { + locAllele1 = al1; + locAllele2 = al2; + if ( ! isSNP() ) { + siteIndelLength = Math.max(locAllele1.length(),locAllele2.length()); + } + } + + // CONSTRUCTOR + + public PlinkVariantInfo(String variantName) { + this.variantName = variantName; + } + + public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) { + this.variantName = variantName; + if ( onlyLookForIndelInfo ) { + parseNameForIndels(); + } else { + parseName(); + } + } + + private void parseName() { + String chrom = this.variantName.split("_c")[1].split("_")[0]; + String pos = this.variantName.split("_p")[1].split("_")[0]; + this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); + this.parseNameForIndels(); + } + + private void parseNameForIndels() { + if ( this.variantName.contains("_gI") || this.variantName.contains("_gD") ) { + this.instantiateIndel(); + } + } + + private void instantiateIndel() { + this.indelHolder = new ArrayList(); + this.sampleHolder = new ArrayList(); + this.siteIndelLength = -1; + this.indelType = this.variantName.contains("_gI") ? Allele.AlleleType.INSERTION : Allele.AlleleType.DELETION; + } + + public void addGenotypeEntry(String genotypeString, String sampleName) { + // identify if we're dealing with a deletion + if ( this.isSNP() ) { + this.addSNP(genotypeString.split("\\s+"),sampleName); + } else { + this.addIndel(genotypeString.split("\\s+"),sampleName); + } + } + + public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { + String[] alleleStr = new String[2]; + if ( genoTYPE == 0 ) { + alleleStr[0] = locAllele1; + alleleStr[1] = locAllele1; + } else if (genoTYPE == 1) { + alleleStr[0] = locAllele1; + alleleStr[1] = locAllele2; + } else { + alleleStr[0] = locAllele2; + alleleStr[1] = locAllele2; + } + + if ( this.isSNP() ) { + this.addSNP(alleleStr,sampleName); + } else { + this.addIndel(alleleStr,sampleName); + } + } + + private void addSNP(String[] alleleStrings, String sampleName) { + ArrayList alleles = new ArrayList(2); + + for ( String alStr : alleleStrings ) { + alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); + } + } + + private void addIndel(String[] alleleStrings, String sampleName) { + String alleleStr1 = alleleStrings[0]; + String alleleStr2 = alleleStrings[1]; + if ( alleleStr1.contains("-") ^ alleleStr2.contains("-") ) { + // heterozygous indel + if ( alleleStr1.contains("-") ) { + this.addHetIndel(alleleStr2,sampleName) ; + } else { + this.addHetIndel(alleleStr1,sampleName); + } + } else { + this.addHomIndel(alleleStr1, alleleStr2, sampleName); + } + } + + private void addHetIndel(String baseStr, String sampleName) { + Allele ref; + Allele alt; + + if ( indelType == Allele.AlleleType.INSERTION ) { + ref = new Allele(Allele.AlleleType.REFERENCE,""); + alt = new Allele(indelType,baseStr); + } else { + alt = new Allele(indelType,""); + ref = new Allele(Allele.AlleleType.REFERENCE,baseStr); + } + + this.setIndelLength(alt,baseStr.length()); + + if ( ! indelHolder.isEmpty() ) { + siteIndelLength = baseStr.length(); + this.addHeldIndels(); + } + + Genotype indel = new Genotype(Arrays.asList(ref,alt), sampleName, 20.0); + this.setIndelGenotypeLength(indel,siteIndelLength); + this.genotypes.add(indel); + this.sampleNames.add(sampleName); + } + + private void addHomIndel(String strand1, String strand2, String sampleName) { + Allele allele1; + Allele allele2; + boolean reference; + if ( indelType == Allele.AlleleType.DELETION ) { + if ( strand1.contains("-") ) { + // homozygous deletion + allele1 = new Allele(indelType,""); + allele2 = new Allele(indelType,""); + reference = false; + } else { + allele1 = new Allele(Allele.AlleleType.REFERENCE,strand1); + allele2 = new Allele(Allele.AlleleType.REFERENCE,strand2); + reference = true; + } + } else { + if ( strand1.contains("-") ) { + // homozygous reference + allele1 = new Allele(Allele.AlleleType.REFERENCE,""); + allele2 = new Allele(Allele.AlleleType.REFERENCE,""); + reference = true; + } else { + allele1 = new Allele(indelType,strand1); + allele2 = new Allele(indelType,strand2); + reference = false; + } + } + + if ( reference ) { + if ( ! indelHolder.isEmpty() ) { + siteIndelLength = strand1.length(); + this.addHeldIndels(); + } + } + + if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site + Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0); + this.genotypes.add(gen); + this.sampleNames.add(sampleName); + } else { // hold on the variants until we *do* know the in/del length at this site + this.indelHolder.add(allele1); + this.indelHolder.add(allele2); + this.sampleHolder.add(sampleName); + } + + } + + private void setIndelGenotypeLength(Genotype g, int length) { + g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length); + } + + private void addHeldIndels() { + Allele del1; + Allele del2; + int startingSize = indelHolder.size(); + for ( int i = 0; i < startingSize ; i+=2 ) { + del1 = indelHolder.get(i); + del2 = indelHolder.get(i+1); + this.addIndelFromCache(del1,del2,sampleHolder.get(i/2)); + if ( indelHolder.size() != startingSize ) { + throw new StingException("Halting algorithm -- possible infinite loop"); + } + } + indelHolder.clear(); + sampleHolder.clear(); + } + + private void addIndelFromCache ( Allele indel1, Allele indel2, String sampleName ) { + this.setIndelLength(indel1,siteIndelLength); + this.setIndelLength(indel2,siteIndelLength); + Genotype indel = new Genotype(Arrays.asList(indel1,indel2),sampleName, 20.0); + this.setIndelGenotypeLength(indel,siteIndelLength); + this.genotypes.add(indel); + this.sampleNames.add(sampleName); + } + + public int compareTo(Object obj) { + if ( ! ( obj instanceof PlinkVariantInfo) ) { + return 1; + } + + return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); + } + + private void setIndelLength(Allele al, int length) { + // Todo -- once alleles support deletion lengths add that information + // Todo -- into the object; for now this can just return + return; + } +} + +class PlinkBinaryTrifecta { + + public PlinkBinaryTrifecta() { + + } + + public File bedFile; + public File bimFile; + public File famFile; + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java b/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java deleted file mode 100644 index 57a4230c3..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java +++ /dev/null @@ -1,307 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.StingException; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Jan 19, 2010 - * Time: 10:24:18 AM - * To change this template use File | Settings | File Templates. - */ -public class SequenomRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { - private final String[] SEQUENOM_HEADER_FIELDS = { "#Family ID", "Individual ID", "Paternal ID", "Maternal ID", "Sex", "Phenotype" } ; - - private ArrayList variants; - private SequenomVariantInfo currentVariant; - private ListIterator variantIterator; - private HashSet headerEntries; - - // // // CONSTRUCTOR // // // - - public SequenomRodWithGenomeLoc(String name) { - super(name); - } - - @Override - public Object initialize(final File seqFile) throws FileNotFoundException { - if ( ! seqFile.exists() ) { - throw new FileNotFoundException("File "+seqFile.getAbsolutePath()+" does not exist."); - } - - headerEntries = new HashSet(Arrays.asList(SEQUENOM_HEADER_FIELDS)); - - variants = parseSequenomFile(seqFile); - if ( variants != null ) { - variantIterator = variants.listIterator(); - currentVariant = variantIterator.next(); - } - - assertNotNull(); - - return null; - } - - private void assertNotNull() { - if ( currentVariant == null ) { - throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); - } - } - - @Override - public boolean parseLine(Object obj, String[] args) { - if ( variantIterator.hasNext() ) { - currentVariant = variantIterator.next(); - return true; - } else { - return false; - } - } - - @Override - public GenomeLoc getLocation() { - return currentVariant.getLocation(); - } - - @Override - public String toString() { - return currentVariant == null ? "" : currentVariant.toString(); - } - - public String getVariantName() { - return currentVariant.getName(); - } - - public ArrayList getVariantSampleNames() { - return currentVariant.getSampleNames(); - } - - public ArrayList getGenotypes() { - return currentVariant.getGenotypes(); - } - - private ArrayList parseSequenomFile(File file) { - try { - BufferedReader reader = new BufferedReader( new FileReader ( file ) ); - String header = reader.readLine(); - ArrayList seqVars = instantiateVariantListFromHeader(header); - ArrayList snpOffsets = getSNPOffsetsFromHeader(header); - - String line = null; - do { - line = reader.readLine(); - incorporateInfo(seqVars,snpOffsets,line); - } while ( line != null ); - - java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these - // are sorted in standard reference order - - return seqVars; - - } catch ( FileNotFoundException e ) { - throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); - } catch ( IOException e ) { - throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); - } - } - - private void incorporateInfo(List vars, List offsets, String seqLine) { - String[] genotypes = seqLine.split("\t"); - String individualName = genotypes[1]; - - int snpNumber = 0; - for ( int i : offsets ) { - vars.get(snpNumber).addGenotypeEntry(genotypes[i], individualName); - snpNumber++; - } - } - - private ArrayList instantiateVariantListFromHeader(String header) { - ArrayList seqVars = new ArrayList(); - String[] headerFields = header.split("\t"); - - for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - // not a standard header, so a variant - seqVars.add(new SequenomVariantInfo(field)); - } - } - - return seqVars; - } - - private ArrayList getSNPOffsetsFromHeader(String header) { - ArrayList offsets = new ArrayList(); - String[] headerFields = header.split("\t"); - - int offset = 0; - for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - offsets.add(offset); - } - offset++; - } - - return offsets; - } -} - -class SequenomVariantInfo implements Comparable { - private String variantName; - private GenomeLoc loc; - private ArrayList genotypes; - private ArrayList sampleNames; - - private ArrayList deletionHolder = new ArrayList(); - private ArrayList sampleHolder = new ArrayList(); - private int siteDeletionLength = -1; - - public GenomeLoc getLocation() { - return loc; - } - - public String getName() { - return variantName; - } - - public ArrayList getSampleNames() { - return sampleNames; - } - - public ArrayList getGenotypes() { - return genotypes; - } - - // CONSTRUCTOR - - public SequenomVariantInfo(String variantName) { - this.variantName = variantName; - this.parseNameToLoc(); - } - - private void parseNameToLoc() { - String chrom = this.variantName.split("_c")[1].split("_")[0]; - String pos = this.variantName.split("_p")[1].split("_")[0]; - this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); - } - - public void addGenotypeEntry(String genotypeString, String sampleName) { - String[] alleleStrs = genotypeString.split(" "); - // identify if we're dealing with a deletion - if ( genotypeString.contains("-") ) { - this.addDeletion(alleleStrs, sampleName); - } else { - // simple SNP or indel (easier to handle) - this.addIndelOrSNP(alleleStrs,sampleName); - } - } - - private void addIndelOrSNP(String[] alleleStrings, String sampleName) { - ArrayList alleles = new ArrayList(2); - - if ( alleleStrings[0].length() > 1 || alleleStrings[1].length() > 1 ) { - // insertion - for ( String alStr : alleleStrings ) { - if ( alStr.length() > 1 ) { - alleles.add(new Allele(Allele.AlleleType.INSERTION,alStr)); - } else { - alleles.add(new Allele(Allele.AlleleType.REFERENCE, alStr)); - } - } - } else { - // SNP - for ( String alStr : alleleStrings ) { - alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); - } - } - } - - private void addDeletion(String[] alleleStrings, String sampleName) { - String alleleStr1 = alleleStrings[0]; - String alleleStr2 = alleleStrings[1]; - Allele allele1 = null; - Allele allele2 = null; - - if ( alleleStr1.contains("-") && alleleStr2.contains("-") ) { - // homozygous deletion - this.addHomDeletion(allele1,allele2, sampleName); - } else { - // heterozygous deletion - if ( alleleStr1.contains("-") ) { - this.addHetDeletion(allele1,allele2, alleleStr1, alleleStr2, sampleName); - } else { - this.addHetDeletion(allele2,allele1, alleleStr2, alleleStr1, sampleName); // note the order change - } - } - } - - private void addHetDeletion(Allele del, Allele ref, String delStr, String refStr, String sampleName) { - del = new Allele(Allele.AlleleType.DELETION,""); - ref = new Allele(Allele.AlleleType.REFERENCE,refStr.substring(0,1)); - this.setDeletionLength(del,refStr.length()); - if ( ! deletionHolder.isEmpty() ) { - siteDeletionLength = refStr.length(); - this.addHeldDeletions(); - } - Genotype indel = new Genotype(Arrays.asList(ref,del), sampleName, 20.0); - this.setIndelGenotypeLength(indel,siteDeletionLength); - this.genotypes.add(indel); - this.sampleNames.add(sampleName); - } - - private void addHomDeletion(Allele allele1, Allele allele2, String sampleName) { - allele1 = new Allele(Allele.AlleleType.DELETION,""); - allele2 = new Allele(Allele.AlleleType.DELETION,""); - if ( siteDeletionLength != -1 ) { - this.setDeletionLength(allele1,siteDeletionLength); - this.setDeletionLength(allele2,siteDeletionLength); - Genotype indel = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0); - this.setIndelGenotypeLength(indel, siteDeletionLength); - this.genotypes.add(indel); - this.sampleNames.add(sampleName); - } else { - deletionHolder.add(allele1); - deletionHolder.add(allele2); - sampleHolder.add(sampleName); - } - } - - private void setIndelGenotypeLength(Genotype g, int length) { - g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length); - } - - private void addHeldDeletions() { - Allele del1; - Allele del2; - int startingSize = deletionHolder.size(); - for ( int i = 0; i < startingSize ; i+=2 ) { - del1 = deletionHolder.get(i); - del2 = deletionHolder.get(i+1); - this.addHomDeletion(del1,del2,sampleHolder.get(i/2)); - if ( deletionHolder.size() != startingSize ) { - throw new StingException("Halting algorithm -- possible infinite loop"); - } - } - deletionHolder.clear(); - sampleHolder.clear(); - } - - public int compareTo(Object obj) { - if ( ! ( obj instanceof SequenomVariantInfo ) ) { - return 1; - } - - return loc.compareTo(((SequenomVariantInfo) obj).getLocation()); - } - - private void setDeletionLength(Allele al, int length) { - // Todo -- once alleles support deletion lengths add that information - // Todo -- into the object; for now this can just return - return; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java index 1ce83da14..384acc284 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java @@ -40,7 +40,8 @@ public class PlinkToVCF extends RefWalker { @Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false) public double maxHomNonref = 1.1; - private final Set HEADER_FIELDS = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype")); + private final Set HEADER_FIELDS = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype", + "FID","IID","PAT","MAT","SEX","PHENOTYPE")); private final int INIT_NUMBER_OF_POPULATIONS = 10; private final int DEFAULT_QUALITY = 20; private HashMap sequenomResults = new HashMap();