diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java b/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java new file mode 100644 index 000000000..69fae36c9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SequenomRodWithGenomeLoc.java @@ -0,0 +1,208 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 19, 2010 + * Time: 10:24:18 AM + * To change this template use File | Settings | File Templates. + */ +public class SequenomRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { + private final String[] SEQUENOM_HEADER_FIELDS = { "#Family ID", "Individual ID", "Paternal ID", "Maternal ID", "Sex", "Phenotype" } ; + + private ArrayList variants; + private SequenomVariantInfo currentVariant; + private ListIterator variantIterator; + private HashSet headerEntries; + + // // // CONSTRUCTOR // // // + + public SequenomRodWithGenomeLoc(String name) { + super(name); + } + + @Override + public Object initialize(final File seqFile) throws FileNotFoundException { + if ( ! seqFile.exists() ) { + throw new FileNotFoundException("File "+seqFile.getAbsolutePath()+" does not exist."); + } + + headerEntries = new HashSet(Arrays.asList(SEQUENOM_HEADER_FIELDS)); + + variants = parseSequenomFile(seqFile); + if ( variants != null ) { + variantIterator = variants.listIterator(); + currentVariant = variantIterator.next(); + } + + assertNotNull(); + + return null; + } + + private void assertNotNull() { + if ( currentVariant == null ) { + throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" ); + } + } + + @Override + public boolean parseLine(Object obj, String[] args) { + if ( variantIterator.hasNext() ) { + currentVariant = variantIterator.next(); + return true; + } else { + return false; + } + } + + @Override + public GenomeLoc getLocation() { + return currentVariant.getLocation(); + } + + @Override + public String toString() { + return currentVariant == null ? "" : currentVariant.toString(); + } + + public String getVariantName() { + return currentVariant.getName(); + } + + public ArrayList getVariantSampleNames() { + return currentVariant.getSampleNames(); + } + + public ArrayList getGenotypes() { + return currentVariant.getGenotypes(); + } + + private ArrayList parseSequenomFile(File file) { + try { + BufferedReader reader = new BufferedReader( new FileReader ( file ) ); + String header = reader.readLine(); + ArrayList seqVars = instantiateVariantListFromHeader(header); + ArrayList snpOffsets = getSNPOffsetsFromHeader(header); + + String line = null; + do { + line = reader.readLine(); + incorporateInfo(seqVars,snpOffsets,line); + } while ( line != null ); + + java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these + // are sorted in standard reference order + + return seqVars; + + } catch ( FileNotFoundException e ) { + throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); + } catch ( IOException e ) { + throw new StingException("Error reading file "+file.getAbsolutePath()+".",e); + } + } + + private void incorporateInfo(List vars, List offsets, String seqLine) { + String[] genotypes = seqLine.split("\t"); + String individualName = genotypes[1]; + + int snpNumber = 0; + for ( int i : offsets ) { + vars.get(snpNumber).addGenotypeEntry(genotypes[i], individualName); + snpNumber++; + } + } + + private ArrayList instantiateVariantListFromHeader(String header) { + ArrayList seqVars = new ArrayList(); + String[] headerFields = header.split("\t"); + + for ( String field : headerFields ) { + if ( ! headerEntries.contains(field) ) { + // not a standard header, so a variant + seqVars.add(new SequenomVariantInfo(field)); + } + } + + return seqVars; + } + + private ArrayList getSNPOffsetsFromHeader(String header) { + ArrayList offsets = new ArrayList(); + String[] headerFields = header.split("\t"); + + int offset = 0; + for ( String field : headerFields ) { + if ( ! headerEntries.contains(field) ) { + offsets.add(offset); + } + offset++; + } + + return offsets; + } +} + +class SequenomVariantInfo implements Comparable { + private String variantName; + private GenomeLoc loc; + private ArrayList genotypes; + private ArrayList sampleNames; + + public GenomeLoc getLocation() { + return loc; + } + + public String getName() { + return variantName; + } + + public ArrayList getSampleNames() { + return sampleNames; + } + + public ArrayList getGenotypes() { + return genotypes; + } + + // CONSTRUCTOR + + public SequenomVariantInfo(String variantName) { + this.variantName = variantName; + this.parseNameToLoc(); + } + + private void parseNameToLoc() { + String chrom = this.variantName.split("_c")[1].split("_")[0]; + String pos = this.variantName.split("_p")[1].split("_")[0]; + this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); + } + + public void addGenotypeEntry(String genotypeString, String sampleName) { + String[] alleleStrs = genotypeString.split(" "); + ArrayList alleles = new ArrayList(2); // most, if not all, will be bi-allelic + for ( String alStr : alleleStrs ) { + Allele.AlleleType type = alStr.indexOf("-") > -1 ? Allele.AlleleType.DELETION : alStr.length() > 1 ? Allele.AlleleType.INSERTION : Allele.AlleleType.SNP; + alleles.add(new Allele(type,alStr)); + } + + this.genotypes.add( new Genotype(alleles, sampleName, 20.0)); + this.sampleNames.add(sampleName); + } + + public int compareTo(Object obj) { + if ( ! ( obj instanceof SequenomVariantInfo ) ) { + return 1; + } + + return loc.compareTo(((SequenomVariantInfo) obj).getLocation()); + } +} \ No newline at end of file