From 732fed9aadfcd1174b2ee7b6c43e03adb9f1639d Mon Sep 17 00:00:00 2001 From: asivache Date: Fri, 22 May 2009 17:17:59 +0000 Subject: [PATCH] ALERT, ALERT! rodSAMPileup is now a GenotypeList, not a Genotype! Now it can intelligently read full samtools pileup files (containing, in general, both point and indel genotypes at the same position). No need to split/synchronize pileups from different individuals anymore, hooray! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@797 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/rodSAMPileup.java | 586 ++++++------------ 1 file changed, 195 insertions(+), 391 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java index 1249f6879..52ac3b2e8 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java @@ -1,13 +1,14 @@ package org.broadinstitute.sting.gatk.refdata; +import java.io.IOException; import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.Matcher; +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.Pileup; +import org.broadinstitute.sting.utils.StingException; + +import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker; /** * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
@@ -18,404 +19,207 @@ import org.broadinstitute.sting.utils.Pileup; * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
*
* for indel:
- * [chr] [pos] [always *] [alleles] [?] [?] [?] [num reads in the pile] ...
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
+ * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
+ * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
* User: asivache * Date: Apr 03, 2009 * Time: 2:58:33 PM * To change this template use File | Settings | File Templates. */ -public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup { - private static final int NO_VARIANT = -1; - private static final int SNP_VARIANT = 0; - private static final int INSERTION_VARIANT = 1; - private static final int DELETION_VARIANT = 2; - private static final int INDEL_VARIANT = 3; - - // allocate once and don't ever bother creating them again: - private static final String baseA = new String("A"); - private static final String baseC = new String("C"); - private static final String baseG = new String("G"); - private static final String baseT = new String("T"); - private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions - - protected GenomeLoc loc; // genomic location of this genotyped site - // Reference sequence chromosome or scaffold - // Start and stop positions in chrom - protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) - protected String refBases; // the reference base sequence according to NCBI - protected String observedString; // store the actual string representation of observed alleles - - protected String pileupQuals; // the read base qualities - protected String pileupBases; // the read bases themselves - - protected List observedAlleles = null; // The sequences of the observed alleles from rs-fasta files - protected int varType = NO_VARIANT; - protected int ploidy = 2; // how many allelic variants we observe? - protected int nNonref = 0; // number of non-reference alleles - protected int eventLength = 0; - - protected double consensusScore; - protected double variantScore; - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - public rodSAMPileup(final String name) { - super(name); - } - - @Override - public boolean parseLine(final Object header, final String[] parts) { -// 0 1 2 3 4 5 6 7 -// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - try { - - String contig = parts[0]; - long start = Long.parseLong(parts[1]) ; - - consensusScore = Double.parseDouble(parts[4]); - variantScore = Double.parseDouble(parts[5]); - - refBaseChar = Character.toUpperCase(parts[2].charAt(0)); - - parts[3] = parts[3].toUpperCase(); - observedString = parts[3]; - - parseBasesAndQuals(parts[8], parts[9]); - - observedAlleles = new ArrayList(2); - - if ( refBaseChar == '*' ) { - parseIndels(parts[3]) ; - if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1); - else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } - else { - // if the variant is a SNP or a reference base (i.e. no variant at all) - assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; - refBases = parts[2].toUpperCase(); - eventLength = 1; - //loc = new GenomeLoc(contig, start, start+1); - loc = new GenomeLoc(contig, start, start); - - char ch = parts[3].charAt(0); - - switch ( ch ) { - case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; - case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; - case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; - case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; - case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; - case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; - case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; - case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; - case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; - case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; - } - if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) - varType = SNP_VARIANT; - if ( observedAlleles.get(0).charAt(0) == refBaseChar || - observedAlleles.get(1).charAt(0) == refBaseChar || - observedAlleles.get(0) == observedAlleles.get(1) - ) nNonref = 1; - else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } - } - } catch ( RuntimeException e ) { - System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts)); - throw e; - } - return true; - // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); - } - - - private void parseIndels(String genotype) { - String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit - - // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; - // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases - // from what we have recorded for the deletion allele (e.g. "-CAC") - boolean hasRefAllele = false; - - for ( int i = 0 ; i < obs.length ; i++ ) { - if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { - hasRefAllele = true; - continue; // we will fill reference allele later - } - - String varBases = obs[i].toUpperCase(); - - switch ( obs[i].charAt(0) ) { - case '+': - if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; - else varType = INSERTION_VARIANT; - refBases = emptyStr; - break; - case '-' : - if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; - else varType = DELETION_VARIANT; - refBases = varBases; // remember what was deleted, this will be saved as "reference allele" - break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); - } - observedAlleles.add(varBases); - eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! - } - if ( hasRefAllele ) { - // we got at least one ref. allele (out of two recorded) - if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; - observedAlleles.add(emptyStr); - observedAlleles.add(emptyStr); // no inserted/deleted bases at the site! - nNonref = 0; // no observations of non-reference allele at all - } else { - nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left - - // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty; - // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above: - observedAlleles.add(emptyStr); - } - } else { - // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we - // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions - // or both deletions as compared to +ACC/-ACC which would still have the same bases - if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1; - else nNonref = 2; - } - // DONE with indels - - } - - private void parseBasesAndQuals(final String bases, final String quals) - { - //System.out.printf("%s%n%s%n", bases, quals); - - // needs to convert the base string with it's . and , to the ref base - StringBuilder baseBuilder = new StringBuilder(); - StringBuilder qualBuilder = new StringBuilder(); - boolean done = false; - for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { - //System.out.printf("%d %d%n", i, j); - char c = (char)bases.charAt(i); - - switch ( c ) { - case '.': // matches reference - case ',': // matches reference - baseBuilder.append(refBaseChar); - qualBuilder.append((char)quals.charAt(j++)); - break; - case '$': // end of read - break; - case '*': // end of indel? - j++; - break; - case '^': // mapping quality - i++; - break; - case '+': // start of indel - case '-': // start of indel - final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 - final String rest = bases.substring(i+1); - //System.out.printf("sub is %s%n", rest); - Matcher match = regex.matcher(rest); - if ( ! match.matches() ) { - if ( refBaseChar != '*' ) - throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); - done = true; - } - else { - String g = match.group(1); - //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); - int l = Integer.parseInt(g); - i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) - //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); - } - break; - default: // non reference base - baseBuilder.append(c); - qualBuilder.append((char)quals.charAt(j++)); - } - } - - pileupBases = baseBuilder.toString(); - pileupQuals = qualBuilder.toString(); - } - - public GenomeLoc getLocation() { return loc; } - public String getQuals() { return pileupQuals; } - public char getRef() { return refBaseChar; } - public int size() { return pileupQuals.length(); } - public String getBases() { return pileupBases; } - - public String getPileupString() - { - return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals()); - } - - - /** Returns bases in the reference allele as a String. String can be empty (as in insertion into - * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters - * (for longer indels). - * - * @return reference allele, forward strand - */ - public String getRefBasesFWD() { - return refBases; - } - - - /** - * Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException - * if variant is not a SNP. - * - * @return reference base on the forward strand - */ - /* public char getRefSnpFWD() throws IllegalStateException { - if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP"); - return refBaseChar; - } -*/ - - @Override - public List getFWDAlleles() { - return observedAlleles; - } - - // ---------------------------------------------------------------------- - // - // What kind of variant are we? - // - // ---------------------------------------------------------------------- - public boolean isSNP() { return varType == SNP_VARIANT ; } - public boolean isInsertion() { return varType == INSERTION_VARIANT; } - public boolean isDeletion() { return varType == DELETION_VARIANT ; } - public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; } - public boolean isReference() { return varType == NO_VARIANT; } - - public boolean isHom() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) ); - return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); - } - - public boolean isHet() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) ); - return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) ); - } - - // ---------------------------------------------------------------------- - // - // formatting - // - // ---------------------------------------------------------------------- - public String toString() { - return String.format("%s\t%d\t%d\t%s\t%s\t%s", - getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString); - } - - public String toSimpleString() { - return String.format("%s:%s", name, observedString); - } - - public String toMediumString() { - - String s = null; - if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString); - else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1)); - if ( isSNP() ) s += ": SNP"; - else { - if ( isInsertion() ) s += ": Insertion"; - else { - if ( isDeletion() ) s+= ": Deletion"; - else { - if ( isIndel() ) s+=": Indel"; - else s+=": Reference"; - } - } - } - return s; - } - - public String repl() { - return String.format("REPL not implemented yet"); - } - -/* - @Override - public String getAltBasesFWD() { - if ( ! isSNP() && ! isIndel() ) return emptyStr; - - if ( isSNP() ) { - if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1); - else return observedAlleles.get(0); - } - - if ( isInsertion() ) { - if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1); - else return observedAlleles.get(0); - } - - if ( isDeletion() ) { - if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0); - else return observedAlleles.get(1); - - } - - System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString()); - return null; - } -*/ - - /* - @Override - public char getAltSnpFWD() throws IllegalStateException { - if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP"); - if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0); - else return observedAlleles.get(0).charAt(0); - - } - - @Override - public double getConsensusConfidence() { - return consensusScore; - } -*/ - - /* - @Override - public int getPloidy() throws IllegalStateException { - return 2; // ??? - } -*/ - - @Override - public double getVariantConfidence() { - return variantScore; - } - - - @Override - public boolean isBiallelic() { - return nNonref < 2; - } +public class rodSAMPileup extends BasicReferenceOrderedDatum implements GenotypeList { + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + + private SAMPileupRecord indelGenotype = null; + private SAMPileupRecord pointGenotype = null; + + public rodSAMPileup(final String name) { + super(name); + } + @Override - public double getConsensusConfidence() { - return consensusScore; + public GenomeLoc getLocation() { + if ( pointGenotype != null ) return pointGenotype.getLocation(); + if ( indelGenotype != null ) return indelGenotype.getLocation(); + return null; + } + + /** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator, + * so this method does nothing at all (always returns false). + * + */ + @Override + public boolean parseLine(Object header, String[] parts) throws IOException { + return false; } @Override - public String getFWDRefBases() { - return refBases; + public String toString() { + StringBuilder b = new StringBuilder(); + if ( pointGenotype != null ) { + b.append(pointGenotype.toString()); + if ( indelGenotype != null ) b.append('\n'); + } + if ( indelGenotype != null ) b.append(indelGenotype.toString()); + return b.toString(); } + + @Override + public Genotype getIndelGenotype() { + return indelGenotype; + } + + @Override + public Genotype getPointGenotype() { + return pointGenotype; + } + + @Override + public boolean hasIndelGenotype() { + return indelGenotype != null; + } + + @Override + public boolean hasPointGenotype() { + return pointGenotype != null; + } + + /** Iterates over SAM pileup multiline records: each new invocation of next() will + * move to next genomic position in the pileup file, and if the record for that position + * consists of multiple lines (as is the case for indels), they all will be read and processed. + * @author asivache + * + */ + private static class rodSAMPileupIterator implements Iterator { + //private xReadLines parser = null; + private String rodName = null; + private PushbackIterator parser = null; + + rodSAMPileupIterator(String name, java.io.File f) { + parser = new PushbackIterator(SAMPileupRecord.createIterator(name,f)); + rodName = name; + } + + @Override + public boolean hasNext() { + return parser.hasNext(); + } + + @Override + public rodSAMPileup next() { + + rodSAMPileup result = new rodSAMPileup(rodName); + + SAMPileupRecord r = parser.next(); + //System.out.printf("Line is %s%n", line); + + if ( r.isPointGenotype() ) result.pointGenotype = r; + else result.indelGenotype = r; + + if ( parser.hasNext() ) { + + SAMPileupRecord r2 = parser.next(); + int cmp = r.getLocation().compareTo(r2.getLocation()); + switch ( cmp ) { + case -1 : // next record is at greater position + parser.pushback(r2); // return next record back to the stream + break; + case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel + if ( result.indelGenotype != null ) { // oops, we've seen indel already + if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() ); + else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() ); + } else { + // ok, the first genotype we've seen was a point one + if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() ); + // and the second is an indel, wheww... + result.indelGenotype = r2; + } + + // the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the + // the same genomic location (point and indel variants), there can not be any more: + if ( parser.hasNext() ) { + r2 = parser.next(); + cmp = r.getLocation().compareTo(r2.getLocation() ) ; + if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation()); + if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + + r.getLocation() ); + parser.pushback(r2); + } + + break; + case 1: // next record is at lower position?? come'on! + throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() ); + default: + throw new StingException("INTERNAL ERROR: this point should never be reached"); + } + } + return result ; + } + + + @Override + public void remove() { + throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups"); + } + + } + + public static Iterator createIterator(String name, java.io.File file) { + return new rodSAMPileup.rodSAMPileupIterator(name,file); + } + + public static void main(String argv[]) { + String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup"; +// String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel"; + + Iterator it = createIterator("test-normal", new java.io.File(testFile)); + + ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( + new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") +// new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta") + ); + + if ( reference.getSequenceDictionary() == null ) { + System.out.println("No reference sequence dictionary found. Abort."); + System.exit(1); + } + + GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary()); + + int counter = 0; + + while ( it.hasNext() && counter < 430 ) { + rodSAMPileup p = it.next(); + System.out.println(p.toString()); +/* + System.out.print(p.getLocation().toString()); + System.out.print('\t'); + + if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); } + else { + if ( p.isSNP() ) { System.out.print("SNP"); } + else { + if ( p.isIndel() ) { System.out.print("Indel"); } + else { System.out.print("REF"); } + } + } + + System.out.print('\t'); + System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1)); + System.out.print('\t'); + System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence()); + */ + counter++; + } + } + + } + +