package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.CommandLineGATK; import java.io.*; import java.util.*; import net.sf.samtools.SAMFileHeader; /** * Created by IntelliJ IDEA. * User: chartl * Date: Jan 19, 2010 * Time: 10:24:18 AM * To change this template use File | Settings | File Templates. */ public class PlinkRod extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); private final byte SNP_MAJOR_MODE = 1; private ArrayList variants; private PlinkVariantInfo currentVariant; private ListIterator variantIterator; private PlinkFileType plinkFileType; public enum PlinkFileType { STANDARD_PED,RAW_PED,BINARY_PED } // // // CONSTRUCTOR // // // public PlinkRod(String name) { super(name); } @Override public Object initialize(final File plinkFile) throws FileNotFoundException { if ( ! plinkFile.exists() ) { throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist."); } variants = parsePlinkFile(plinkFile); if ( variants != null ) { variantIterator = variants.listIterator(); currentVariant = variantIterator.next(); } assertNotNull(); return null; } private void assertNotNull() { if ( currentVariant == null ) { throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); } } @Override public boolean parseLine(Object obj, String[] args) { if ( variantIterator.hasNext() ) { currentVariant = variantIterator.next(); return true; } else { return false; } } @Override public GenomeLoc getLocation() { return currentVariant.getLocation(); } @Override public String toString() { return currentVariant == null ? "" : currentVariant.toString(); } public String getVariantName() { return currentVariant.getName(); } public ArrayList getVariantSampleNames() { return currentVariant.getSampleNames(); } public ArrayList getGenotypes() { return currentVariant.getGenotypes(); } public boolean variantIsSNP() { return currentVariant.isSNP(); } // AM I PARSING A TEXT OR A BINARY FILE ?? private ArrayList parsePlinkFile(File file) { String[] splitFileName = file.getName().split("\\."); String extension = splitFileName[splitFileName.length-1]; if ( extension.equals("ped") || extension.equals("raw") ) { return parseTextFormattedPlinkFile(file); } else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) { plinkFileType = PlinkFileType.BINARY_PED; return parseBinaryFormattedPlinkFile(file); } else { System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format"); return parseTextFormattedPlinkFile(file); } } /* *** *** *** *** *** ** *** ** *** ** *** ** *** ** *** * * PARSING STANDARD TEXT PED FILES * ** * *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/ private ArrayList parseTextFormattedPlinkFile( File file ) { try { BufferedReader reader = new BufferedReader( new FileReader ( file ) ); String header = reader.readLine(); ArrayList seqVars = instantiateVariantListFromHeader(header); ArrayList snpOffsets = getSNPOffsetsFromHeader(header); String line = null; do { line = reader.readLine(); incorporateInfo(seqVars,snpOffsets,line); } while ( line != null ); java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these // are sorted in standard reference order return seqVars; } catch ( FileNotFoundException e ) { throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); } catch ( IOException e ) { throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); } } private void incorporateInfo(List vars, List offsets, String plinkLine) { if ( plinkLine == null ) { return; } String[] plinkInfo; if ( plinkFileType == PlinkFileType.STANDARD_PED) { plinkInfo = plinkLine.split("\t"); } else { throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); } String individualName = plinkInfo[1]; int snpNumber = 0; if ( plinkFileType == PlinkFileType.STANDARD_PED ) { for ( int i : offsets ) { vars.get(snpNumber).addGenotypeEntry(plinkInfo[i], individualName); snpNumber++; } } } private ArrayList instantiateVariantListFromHeader(String header) { if ( header.startsWith("#") ) {// first line is a comment; what we're used to seeing plinkFileType = PlinkFileType.STANDARD_PED; } else {// first line is the raw header; comes from de-binary-ing a .bed file plinkFileType = PlinkFileType.RAW_PED; throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); } ArrayList seqVars = new ArrayList(); String[] headerFields; if ( plinkFileType == PlinkFileType.STANDARD_PED ) { headerFields = header.split("\t"); } else { throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); } for ( String field : headerFields ) { if ( ! headerEntries.contains(field) ) { // not a standard header, so a variant seqVars.add(new PlinkVariantInfo(field)); } } return seqVars; } private ArrayList getSNPOffsetsFromHeader(String header) { ArrayList offsets = new ArrayList(); String[] headerFields; if ( plinkFileType == PlinkFileType.STANDARD_PED ) { headerFields = header.split("\t+"); } else { headerFields = header.split("\\s+"); } int offset = 0; for ( String field : headerFields ) { if ( ! headerEntries.contains(field) ) { offsets.add(offset); } offset++; } return offsets; } /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** * * PARSING BINARY PLINK BED/BIM/FAM FILES * * * *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/ private ArrayList parseBinaryFormattedPlinkFile(File file) { PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); ArrayList sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile); java.util.Collections.sort(updatedVariants); return updatedVariants; } private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) { // just gonna parse the name PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta(); String absolute_path = file.getAbsolutePath(); String[] directory_tree = absolute_path.split("/"); String file_name = directory_tree[directory_tree.length-1].split("\\.")[0]; StringBuilder pathBuilder = new StringBuilder(); for ( int i = 0; i < directory_tree.length - 1; i ++ ) { pathBuilder.append(String.format("%s/",directory_tree[i])); } String path = pathBuilder.toString(); trifecta.bedFile = new File(path+file_name+".bed"); trifecta.bimFile = new File(path+file_name+".bim"); trifecta.famFile = new File(path+file_name+".fam"); return trifecta; } private ArrayList instantiateVariantsFromBimFile(File bimFile) { BufferedReader reader; try { reader = new BufferedReader( new FileReader( bimFile )); } catch ( FileNotFoundException e) { throw new StingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+ "Please ensure that it is in the same directory as the .bed and .fam files. The file we "+ "Were looking for was "+bimFile.getAbsolutePath(),e); } ArrayList variants = new ArrayList(); try { String line = null; do { line = reader.readLine(); if ( line != null ) { String[] snpInfo = line.split("\\s+"); PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true); variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3]))); variant.setAlleles(snpInfo[4],snpInfo[5]); variants.add(variant); } } while ( line != null ); } catch ( IOException e ) { throw new StingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e); } return variants; } private ArrayList getSampleNameOrderingFromFamFile(File famFile) { BufferedReader reader; try { reader = new BufferedReader( new FileReader( famFile )); } catch ( FileNotFoundException e) { throw new StingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+ "Please ensure that it is in the same directory as the .bed and .bim files. The file we "+ "Were looking for was "+famFile.getAbsolutePath(),e); } ArrayList sampleNames = new ArrayList(); try { String line = null; do { line = reader.readLine(); if ( line != null ) { sampleNames.add(line.split("\\s+")[1]); } } while ( line != null ); } catch (IOException e) { throw new StingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e); } return sampleNames; } private ArrayList getGenotypesFromBedFile(ArrayList variants, ArrayList samples, File bedFile) { FileInputStream inStream; try { inStream = new FileInputStream(bedFile); } catch (FileNotFoundException e) { throw new StingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+ "Please ensure that it is in the same directory as the .bim and .fam files. The file we "+ "Were looking for was "+bedFile.getAbsolutePath(),e); } try { byte genotype = -1; long bytesRead = 0; int snpOffset = 0; int sampleOffset = 0; boolean snpMajorMode = true; do { genotype = (byte) inStream.read(); bytesRead++; if ( genotype != -1 ) { if ( bytesRead > 3 ) { addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode); if ( snpMajorMode ) { sampleOffset = sampleOffset + 4; if ( sampleOffset > samples.size() -1 ) { snpOffset ++; sampleOffset = 0; } } else { snpOffset = snpOffset + 4; if ( snpOffset > variants.size() -1 ) { sampleOffset ++; snpOffset = 0; } } } else { if ( bytesRead == 3) { snpMajorMode = genotype == SNP_MAJOR_MODE; } } } } while ( genotype != -1 ); } catch ( IOException e) { throw new StingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e); } return variants; } private void addGenotypeByte(byte genotype, ArrayList variants, ArrayList sampleNames, int snpOffset, int sampleOffset, boolean major) { // four genotypes encoded in this byte int[] genotypes = parseGenotypes(genotype); for ( int g : genotypes ) { variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset)); if ( major ) { sampleOffset++; if ( sampleOffset > sampleNames.size()-1 ) { //using offsets for comparison; size 5 == offset 4 return; } } else { snpOffset++; if( snpOffset > variants.size()-1 ) { return; } } } } private int[] parseGenotypes(byte genotype) { int[] genotypes = new int[4]; genotypes[0] = ( genotype & 3 ); genotypes[1] = ( ( genotype & 12 ) >>> 2 ); genotypes[2] = ( ( genotype & 48 ) >>> 4 ); genotypes[3] = ( ( genotype & 192 ) >>> 6 ); return genotypes; } } class PlinkVariantInfo implements Comparable { private String variantName; private GenomeLoc loc; private ArrayList genotypes = new ArrayList(); private ArrayList sampleNames = new ArrayList(); private ArrayList indelHolder; private ArrayList sampleHolder; private int siteIndelLength; private Allele.AlleleType indelType; // for binary parsing private String locAllele1; private String locAllele2; public GenomeLoc getLocation() { return loc; } public String getName() { return variantName; } public ArrayList getSampleNames() { return sampleNames; } public ArrayList getGenotypes() { return genotypes; } public boolean isSNP() { return this.indelType == null; } public void setGenomeLoc(GenomeLoc loc) { this.loc = loc; } public void setAlleles(String al1, String al2) { if ( al1.equals("0") ) { // encoding for a site at which no variants were detected locAllele1 = al2; } else { locAllele1 = al1; } locAllele2 = al2; if ( ! isSNP() ) { siteIndelLength = Math.max(locAllele1.length(),locAllele2.length()); } } // CONSTRUCTOR public PlinkVariantInfo(String variantName) { this.variantName = variantName; parseName(); } public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) { this.variantName = variantName; if ( onlyLookForIndelInfo ) { parseNameForIndels(); } else { parseName(); } } private void parseName() { String chrom = this.variantName.split("\\|c")[1].split("_")[0]; String pos = this.variantName.split("_p")[1].split("_")[0]; this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); this.parseNameForIndels(); } private void parseNameForIndels() { if ( this.variantName.contains("_gI") || this.variantName.contains("_gD") ) { this.instantiateIndel(); } } private void instantiateIndel() { this.indelHolder = new ArrayList(); this.sampleHolder = new ArrayList(); this.siteIndelLength = -1; this.indelType = this.variantName.contains("_gI") ? Allele.AlleleType.INSERTION : Allele.AlleleType.DELETION; } public void addGenotypeEntry(String genotypeString, String sampleName) { // identify if we're dealing with a deletion if ( this.isSNP() ) { this.addSNP(genotypeString.split("\\s+"),sampleName); } else { this.addIndel(genotypeString.split("\\s+"),sampleName); } } public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { String[] alleleStr = new String[2]; if ( genoTYPE == 0 ) { alleleStr[0] = locAllele1; alleleStr[1] = locAllele1; } else if (genoTYPE == 2) { alleleStr[0] = locAllele1; alleleStr[1] = locAllele2; } else if (genoTYPE == 3 ) { alleleStr[0] = locAllele2; alleleStr[1] = locAllele2; } else { alleleStr[0] = "0"; alleleStr[1] = "0"; } if ( this.isSNP() ) { this.addSNP(alleleStr,sampleName); } else { this.addIndel(alleleStr,sampleName); } } private void addSNP(String[] alleleStrings, String sampleName) { ArrayList alleles = new ArrayList(2); for ( String alStr : alleleStrings ) { alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); } genotypes.add(new Genotype(alleles,sampleName,20.0) ); sampleNames.add(sampleName); } private void addIndel(String[] alleleStrings, String sampleName) { String alleleStr1 = alleleStrings[0]; String alleleStr2 = alleleStrings[1]; if ( alleleStr1.contains("-") ^ alleleStr2.contains("-") ) { // heterozygous indel if ( alleleStr1.contains("-") ) { this.addHetIndel(alleleStr2,sampleName) ; } else { this.addHetIndel(alleleStr1,sampleName); } } else { this.addHomIndel(alleleStr1, alleleStr2, sampleName); } } private void addHetIndel(String baseStr, String sampleName) { Allele ref; Allele alt; if ( indelType == Allele.AlleleType.INSERTION ) { ref = new Allele(Allele.AlleleType.REFERENCE,""); alt = new Allele(indelType,baseStr); } else { alt = new Allele(indelType,""); ref = new Allele(Allele.AlleleType.DELETION_REFERENCE,baseStr); } this.setIndelLength(alt,baseStr.length()); if ( ! indelHolder.isEmpty() ) { siteIndelLength = baseStr.length(); this.addHeldIndels(); } Genotype indel = new Genotype(Arrays.asList(ref,alt), sampleName, 20.0); this.setIndelGenotypeLength(indel,siteIndelLength); this.genotypes.add(indel); this.sampleNames.add(sampleName); } private void addHomIndel(String strand1, String strand2, String sampleName) { Allele allele1; Allele allele2; boolean reference; if ( indelType == Allele.AlleleType.DELETION ) { if ( strand1.contains("-") ) { // homozygous deletion allele1 = new Allele(indelType,""); allele2 = new Allele(indelType,""); reference = false; } else { allele1 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand1); allele2 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand2); reference = true; } } else { if ( strand1.contains("-") ) { // homozygous reference allele1 = new Allele(Allele.AlleleType.REFERENCE,""); allele2 = new Allele(Allele.AlleleType.REFERENCE,""); reference = true; } else { allele1 = new Allele(indelType,strand1); allele2 = new Allele(indelType,strand2); reference = false; } } if ( reference ) { if ( ! indelHolder.isEmpty() ) { siteIndelLength = strand1.length(); this.addHeldIndels(); } } if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0); setIndelGenotypeLength(gen,siteIndelLength); this.genotypes.add(gen); this.sampleNames.add(sampleName); } else { // hold on the variants until we *do* know the in/del length at this site this.indelHolder.add(allele1); this.indelHolder.add(allele2); this.sampleHolder.add(sampleName); } } private void setIndelGenotypeLength(Genotype g, int length) { g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length); } private void addHeldIndels() { Allele del1; Allele del2; int startingSize = indelHolder.size(); for ( int i = 0; i < startingSize ; i+=2 ) { del1 = indelHolder.get(i); del2 = indelHolder.get(i+1); this.addIndelFromCache(del1,del2,sampleHolder.get(i/2)); if ( indelHolder.size() != startingSize ) { throw new StingException("Halting algorithm -- possible infinite loop"); } } indelHolder.clear(); sampleHolder.clear(); } private void addIndelFromCache ( Allele indel1, Allele indel2, String sampleName ) { this.setIndelLength(indel1,siteIndelLength); this.setIndelLength(indel2,siteIndelLength); Genotype indel = new Genotype(Arrays.asList(indel1,indel2),sampleName, 20.0); this.setIndelGenotypeLength(indel,siteIndelLength); this.genotypes.add(indel); this.sampleNames.add(sampleName); } public int compareTo(Object obj) { if ( ! ( obj instanceof PlinkVariantInfo) ) { return 1; } return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); } private void setIndelLength(Allele al, int length) { // Todo -- once alleles support deletion lengths add that information // Todo -- into the object; for now this can just return return; } } class PlinkBinaryTrifecta { public PlinkBinaryTrifecta() { } public File bedFile; public File bimFile; public File famFile; }