diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java deleted file mode 100644 index 1b90b55bf..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - - -import java.io.FileNotFoundException; -import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.Matcher; - -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.text.XReadLines; - -import net.sf.picard.reference.ReferenceSequenceFileWalker; -import net.sf.samtools.util.StringUtil; - -/** - * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
- * - * Example format:
- * for SNP:
- * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
- * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
- *
- * for indel:
- * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
- * User: asivache - * Date: Apr 03, 2009 - * Time: 2:58:33 PM - * To change this template use File | Settings | File Templates. - */ - - -public class SAMPileupRecord { - private static final int NO_VARIANT = -1; - private static final int SNP_VARIANT = 0; - private static final int INSERTION_VARIANT = 1; - private static final int DELETION_VARIANT = 2; - private static final int INDEL_VARIANT = 3; - - // allocate once and don't ever bother creating them again: - private static final String baseA = new String("A"); - private static final String baseC = new String("C"); - private static final String baseG = new String("G"); - private static final String baseT = new String("T"); - private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions - - protected GenomeLoc loc; // genomic location of this genotyped site - // Reference sequence chromosome or scaffold - // Start and stop positions in chrom - - protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) - protected String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions - protected String observedString; // stores the actual string representation of observed alleles (for point mutations stores as A/C, not in extended alphabet!!) - - protected String pileupQuals; // the read base qualities - protected String pileupBases; // the read bases themselves - - protected List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion - protected int varType = NO_VARIANT; - protected int ploidy = 2; // how many allelic variants we have? - protected int nNonref = 0; // number of non-reference alleles observed - protected int eventLength = 0; // number of inserted or deleted bases - - protected double consensusScore = 0; - protected double variantScore = 0; - - private char fldDelim = '\t'; - private String name = emptyStr; - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - public SAMPileupRecord(String name) { - this.name = name; // keeps track name - } - - public boolean parseLine(final String line) { -// 0 1 2 3 4 5 6 7 -//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - try { - - int[] pos = Utils.indexOfAll(line, fldDelim ); - String contig = line.substring(0,pos[0] ); // field 0 - long start = Long.parseLong(line.substring(pos[0]+1, pos[1])) ; // field 1 - - refBaseChar = Character.toUpperCase(line.charAt(pos[1]+1)); // field 2 ( single char) - - observedString = line.substring(pos[2]+1, pos[3]).toUpperCase(); // field 3 - observedAlleles = new ArrayList(2); - - consensusScore = Double.parseDouble(line.substring(pos[3]+1,pos[4])); - variantScore = Double.parseDouble(line.substring(pos[4]+1, pos[5])); - - if ( refBaseChar == '*' ) { - - parseIndels(observedString) ; - if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.createGenomeLoc(contig, start, start+eventLength-1); - else loc = GenomeLocParser.createGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } else { - parseBasesAndQuals(line,pos[7]+1,pos[8], pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) ); -// parseBasesAndQuals(line.substring(pos[7]+1,pos[8]), line.substring(pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) ) ); - // if the variant is a SNP or a reference base (i.e. no variant at all) - if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); - - refBases = line.substring(pos[1]+1, pos[2]).toUpperCase(); - eventLength = 1; - //loc = new GenomeLoc(contig, start, start+1); - loc = GenomeLocParser.createGenomeLoc(contig, start, start); - - char ch = observedString.charAt(0); - - switch ( ch ) { - case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; - case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; - case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; - case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; - case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; - case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; - case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; - case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; - case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; - case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; - } - if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) - varType = SNP_VARIANT; - if ( observedAlleles.get(0).charAt(0) == refBaseChar || - observedAlleles.get(1).charAt(0) == refBaseChar || - observedAlleles.get(0) == observedAlleles.get(1) - ) nNonref = 1; - else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } - } - } catch ( RuntimeException e ) { - System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", line); - throw e; - } - return true; - // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); - } - - - /** Parses a line from SAM pileup file into this SAMPileupRecord object.. - * @param parts line from SAM pileup file, split on delimiter character (tab) - */ - public boolean parseLine(final String[] parts) { -// 0 1 2 3 4 5 6 7 -// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - try { - - String contig = parts[0]; - long start = Long.parseLong(parts[1]) ; - - refBaseChar = Character.toUpperCase(parts[2].charAt(0)); - - parts[3] = parts[3].toUpperCase(); - observedString = parts[3]; - observedAlleles = new ArrayList(2); - - consensusScore = Double.parseDouble(parts[4]); - variantScore = Double.parseDouble(parts[5]); - - if ( refBaseChar == '*' ) { - - parseIndels(parts[3]) ; - if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.parseGenomeLoc(contig, start, start+eventLength-1); - else loc = GenomeLocParser.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } - else { - parseBasesAndQuals(parts[8], parts[9]); - // if the variant is a SNP or a reference base (i.e. no variant at all) - if ( parts[3].length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); - - refBases = parts[2].toUpperCase(); - eventLength = 1; - //loc = GenomeLoc.parseGenomeLoc(contig, start, start+1); - loc = GenomeLocParser.parseGenomeLoc(contig, start, start); - - char ch = parts[3].charAt(0); - - switch ( ch ) { - case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; - case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; - case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; - case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; - case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; - case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; - case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; - case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; - case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; - case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; - } - if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) - varType = SNP_VARIANT; - if ( observedAlleles.get(0).charAt(0) == refBaseChar || - observedAlleles.get(1).charAt(0) == refBaseChar || - observedAlleles.get(0) == observedAlleles.get(1) - ) nNonref = 1; - else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } - } - } catch ( RuntimeException e ) { - System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts)); - throw e; - } - return true; - // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); - } - - - private void parseIndels(String genotype) { - String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit - - // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; - // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases - // from what we have recorded for the deletion allele (e.g. "-CAC") - boolean hasRefAllele = false; - - for ( int i = 0 ; i < obs.length ; i++ ) { - if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { - hasRefAllele = true; - observedAlleles.add(emptyStr); - continue; - } - - String varBases = obs[i].toUpperCase(); - - switch ( obs[i].charAt(0) ) { - case '+': - if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; - else varType = INSERTION_VARIANT; - refBases = emptyStr; - break; - case '-' : - if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; - else varType = DELETION_VARIANT; - refBases = varBases; // remember what was deleted, this will be saved as "reference allele" - break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); - } - observedAlleles.add(varBases); - eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! - } - if ( hasRefAllele ) { - // we got at least one ref. allele (out of two recorded) - if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; - nNonref = 0; // no observations of non-reference allele at all - refBases = emptyStr; - } else { - nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left - } - } else { - // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we - // fail to set data in a consistent way. - if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1; - else nNonref = 2; - } - // DONE with indels - - } - - private void parseBasesAndQuals(final String bases, final String quals) - { - //System.out.printf("%s%n%s%n", bases, quals); - - // needs to convert the base string with it's . and , to the ref base - StringBuilder baseBuilder = new StringBuilder(); - StringBuilder qualBuilder = new StringBuilder(); - boolean done = false; - for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { - //System.out.printf("%d %d%n", i, j); - char c = (char)bases.charAt(i); - - switch ( c ) { - case '.': // matches reference - case ',': // matches reference - baseBuilder.append(refBaseChar); - qualBuilder.append((char)quals.charAt(j++)); - break; - case '$': // end of read - break; - case '*': // end of indel? - j++; - break; - case '^': // mapping quality - i++; - break; - case '+': // start of indel - case '-': // start of indel - final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 - final String rest = bases.substring(i+1); - //System.out.printf("sub is %s%n", rest); - Matcher match = regex.matcher(rest); - if ( ! match.matches() ) { - if ( refBaseChar != '*' ) - throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); - done = true; - } - else { - String g = match.group(1); - //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); - int l = Integer.parseInt(g); - i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) - //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); - } - break; - default: // non reference base - baseBuilder.append(c); - qualBuilder.append((char)quals.charAt(j++)); - } - } - - pileupBases = baseBuilder.toString(); - pileupQuals = qualBuilder.toString(); - } - - private void parseBasesAndQuals(final String line, int basesStart, int basesStop, int qualsStart, int qualsStop) - { - //System.out.printf("%s%n%s%n", bases, quals); - - // needs to convert the base string with it's . and , to the ref base - StringBuilder baseBuilder = new StringBuilder(); - StringBuilder qualBuilder = new StringBuilder(); - boolean done = false; - for ( int i = basesStart, j = qualsStart; i < basesStop && ! done; i++ ) { - //System.out.printf("%d %d%n", i, j); - char c = (char)line.charAt(i); - - switch ( c ) { - case '.': // matches reference - case ',': // matches reference - baseBuilder.append(refBaseChar); - qualBuilder.append((char)line.charAt(j++)); - break; - case '$': // end of read - break; - case '*': // end of indel? - j++; - break; - case '^': // mapping quality - i++; - break; - case '+': // start of indel - case '-': // start of indel - final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 - final String rest = line.substring(i+1,basesStop); - //System.out.printf("sub is %s%n", rest); - Matcher match = regex.matcher(rest); - if ( ! match.matches() ) { - if ( refBaseChar != '*' ) - throw new RuntimeException("Bad pileup format: " + line.substring(basesStart, basesStop) + " at position " + (i-basesStart)); - done = true; - } - else { - String g = match.group(1); - //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); - int l = Integer.parseInt(g); - i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) - //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); - } - break; - default: // non reference base - baseBuilder.append(c); - qualBuilder.append((char)line.charAt(j++)); - } - } - - pileupBases = baseBuilder.toString(); - pileupQuals = qualBuilder.toString(); - } - - - public GenomeLoc getLocation() { return loc; } - public String getQualsAsString() { return pileupQuals; } - - /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. - * - */ - public char getRef() { return refBaseChar; } - public int size() { return pileupQuals.length(); } - - /** Returns pile of observed bases over the current genomic location. - * - */ - public String getBasesAsString() { return pileupBases; } - - /** Returns formatted pileup string for the current genomic location as - * "location: reference_base observed_base_pile observed_qual_pile" - */ - public String getPileupString() - { - return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString()); - } - - public ArrayList getBasesAsArrayList() { throw new StingException("Not implemented"); } - public ArrayList getQualsAsArrayList() { throw new StingException("Not implemented"); } - - /** - * Gets the bases in byte array form. - * @return byte array of the available bases. - */ - public byte[] getBases() { - return StringUtil.stringToBytes(getBasesAsString()); - } - - /** - * Gets the Phred base qualities without ASCII offset. - * @return Phred base qualities. - */ - public byte[] getQuals() { - byte[] quals = StringUtil.stringToBytes(getQualsAsString()); - for(int i = 0; i < quals.length; i++) quals[i] -= 33; - return quals; - } - - /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single - * character (reference base). For indel genotypes, the string is empty for insertions into - * the reference, or consists of deleted bases for deletions. - * - * @return reference allele, forward strand - */ - public String getFWDRefBases() { - return refBases; - } - - - public List getFWDAlleles() { - return observedAlleles; - } - - // ---------------------------------------------------------------------- - // - // What kind of variant are we? - // - // ---------------------------------------------------------------------- - public boolean isSNP() { return varType == SNP_VARIANT ; } - public boolean isInsertion() { return varType == INSERTION_VARIANT; } - public boolean isDeletion() { return varType == DELETION_VARIANT ; } - public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; } - public boolean isReference() { return varType == NO_VARIANT; } - - public boolean isHom() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) ); - return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); - } - - public boolean isHet() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) ); - return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) ); - } - - // ---------------------------------------------------------------------- - // - // formatting - // - // ---------------------------------------------------------------------- - public String toString() { - return String.format("%s\t%d\t%d\t%s\t%s\t%s", - getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString); - } - - public String toSimpleString() { - return String.format("%s:%s", name, observedString); - } - - public String toMediumString() { - - String s = null; - if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString); - else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1)); - if ( isSNP() ) s += ": SNP"; - else { - if ( isInsertion() ) s += ": Insertion"; - else { - if ( isDeletion() ) s+= ": Deletion"; - else { - if ( isIndel() ) s+=": Indel"; - else s+=": Reference"; - } - } - } - return s; - } - - public String repl() { - return String.format("REPL not implemented yet"); - } - - - public double getVariantConfidence() { - return variantScore; - } - - - public boolean isBiallelic() { - return nNonref < 2; - } - - public double getConsensusConfidence() { - return consensusScore; - } - - public int length() { - return eventLength; - } - - public boolean isIndelGenotype() { - return refBaseChar == '*'; - } - - - public boolean isPointGenotype() { - return ! isIndelGenotype(); - } - - /** Implements method required by GenotypeList interface. If this object represents - * an indel genotype, then it returns itself through this method. If this object is a - * point genotype, this method returns null. - * @return - */ - public SAMPileupRecord getIndelGenotype() { - if ( isIndelGenotype() ) return this; - else return null; - } - - /** Implements method required by GenotypeList interface. If this object represents - * a point genotype, then it returns itself through this method. If this object is an - * indel genotype, this method returns null. - * @return - */ - public SAMPileupRecord getPointGenotype() { - if ( isPointGenotype() ) return this; - else return null; - } - - /** Returns true if this object \em is an indel genotype (and thus - * indel genotype is what it only has). - * @return - */ - public boolean hasIndelGenotype() { - return isIndelGenotype(); - } - - /** Returns true if this object \em is a point genotype (and thus - * point genotype is what it only has. - * @return - */ - public boolean hasPointGenotype() { - return isPointGenotype(); - } - - public int compareTo(ReferenceOrderedDatum o) { - return getLocation().compareTo(o.getLocation()); - } - - protected static class pileupFileIterator implements Iterator { -// private String fieldDelimiter = new String("\t"); - private String rodName = null; - private XReadLines parser = null; - private String lastProcessedLine = null; - -// private long z = 0; -// private long t = 0; - - pileupFileIterator(String name, java.io.File f) { - try { - parser = new XReadLines(f); - } catch ( FileNotFoundException e ) { - Utils.scareUser("Couldn't open file: " + f); - } - rodName = name; - } - - public boolean hasNext() { - return parser.hasNext(); - } - - /** - * @return the next element in the iteration. - * @throws NoSuchElementException - iterator has no more elements. - */ - public SAMPileupRecord next() { - if (!this.hasNext()) throw new NoSuchElementException("SAMPileupRecord next called on iterator with no more elements"); -// if ( z == 0 ) t = System.currentTimeMillis(); - lastProcessedLine = parser.next(); - SAMPileupRecord n = new SAMPileupRecord(rodName); - //System.out.printf("Line is %s%n", line); -// String parts[] = lastProcessedLine.split(fieldDelimiter); -// n.parseLine(parts); - - n.parseLine(lastProcessedLine); -// if ( ++z == 1000000 ) { System.out.println(rodName + ": 1,000,000 records read in "+((double)(System.currentTimeMillis()-t)/1000.0) +" s"); z = 0; } - return n ; - } - - - public void remove() { - throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups"); - } - - } - - - public static Iterator createIterator(String name, java.io.File f) { - return new pileupFileIterator(name,f); - } - - - - public static void main(String argv[]) { -// String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup"; - String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel"; - - Iterator it = createIterator("test-normal", new java.io.File(testFile)); - - ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( - //new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta") - ); - - if ( reference.getSequenceDictionary() == null ) { - System.out.println("No reference sequence dictionary found. Abort."); - System.exit(1); - } - - GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary()); - - int counter = 0; - - while ( it.hasNext() && counter < 430 ) { - SAMPileupRecord p = it.next(); - - System.out.print(p.getLocation().toString()); - System.out.print('\t'); - - if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); } - else { - if ( p.isSNP() ) { System.out.print("SNP"); } - else { - if ( p.isIndel() ) { System.out.print("Indel"); } - else { System.out.print("REF"); } - } - } - - System.out.print('\t'); - System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1)); - System.out.print('\t'); - System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence()); - counter++; - } - } - - - - -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java new file mode 100644 index 000000000..b9278ad30 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.features.sampileup; + +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.Feature; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.ArrayList; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType; + +/** + * A Tribble encoder / decoder for SAM pileup data. + * + * @author mhanna + * @version 0.1 + */ +public class SAMPileupCodec implements FeatureCodec { + // the number of tokens we expect to parse from a dbSNP line + private static final int expectedTokenCount = 10; + private static final char fldDelim = '\t'; + + // allocate once and don't ever bother creating them again: + private static final String baseA = "A"; + private static final String baseC = "C"; + private static final String baseG = "G"; + private static final String baseT = "T"; + private static final String emptyStr = ""; // we will use this for "reference" allele in insertions + + /** + * Return the # of header lines for this file. + * + * @param f the file + * @return 0 in this case, we assume no header lines. The DbSNP file may have a + * header line beginning with '#', but we can ignore that in the decode function. + */ + public int headerLineCount(File f) { + // we don't require a header line, but it may exist. We'll deal with that above. + return 0; + } + + public Feature decode(String line) { +// 0 1 2 3 4 5 6 7 +//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) +//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 + String[] tokens = new String[expectedTokenCount]; + + // split the line + int count = ParsingUtils.split(line,tokens,fldDelim); + + // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) + if (count != expectedTokenCount) + throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " + + "(expected = " + expectedTokenCount + ", saw = " + count + " on " + + "line = " + line + ")"); + + SAMPileupFeature feature = new SAMPileupFeature(tokens[0], + 0, + 0); + + feature.setStart(Integer.parseInt(tokens[1])); + + if(tokens[2].length() != 1) + throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); + feature.setRef(Character.toUpperCase(tokens[2].charAt(0))); + + String observedString = tokens[3].toUpperCase(); // field 3 + feature.setFWDAlleles(new ArrayList(2)); + + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + + if ( feature.getRef() == '*' ) { + parseIndels(observedString,feature) ; + if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); + else feature.setEnd(feature.getStart()-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! + } else { + parseBasesAndQuals(feature,tokens[8],tokens[9]); + // if the variant is a SNP or a reference base (i.e. no variant at all) + if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + + char ch = observedString.charAt(0); + + switch ( ch ) { + case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break; + case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; + case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; + case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; + case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; + case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; + case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; + case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; + case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; + case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; + } + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); + else { + // we know that at least one allele is non-ref; + // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. + // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) + feature.setVariantType(VariantType.SNP); + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) + ) feature.setNumNonRef(1); + else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... + } + } + + return feature; + } + + private void parseIndels(String genotype,SAMPileupFeature feature) { + String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit + + // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; + // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases + // from what we have recorded for the deletion allele (e.g. "-CAC") + boolean hasRefAllele = false; + + for ( int i = 0 ; i < obs.length ; i++ ) { + if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { + hasRefAllele = true; + feature.getFWDAlleles().add(emptyStr); + continue; + } + + String varBases = obs[i].toUpperCase(); + + switch ( obs[i].charAt(0) ) { + case '+': + if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.INSERTION); + feature.setRefBases(emptyStr); + break; + case '-' : + if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.DELETION); + feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" + break; + default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); + } + feature.getFWDAlleles().add(varBases); + feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! + } + if ( hasRefAllele ) { + // we got at least one ref. allele (out of two recorded) + if (feature.isReference()) { // both top theories are actually ref allele; + feature.setNumNonRef(0); // no observations of non-reference allele at all + feature.setRefBases(emptyStr); + } else { + feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left + } + } else { + // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we + // fail to set data in a consistent way. + if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1); + else feature.setNumNonRef(2); + } + // DONE with indels + + } + + private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals) + { + //System.out.printf("%s%n%s%n", bases, quals); + + // needs to convert the base string with it's . and , to the ref base + StringBuilder baseBuilder = new StringBuilder(); + StringBuilder qualBuilder = new StringBuilder(); + boolean done = false; + for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { + //System.out.printf("%d %d%n", i, j); + char c = (char)bases.charAt(i); + + switch ( c ) { + case '.': // matches reference + case ',': // matches reference + baseBuilder.append(feature.getRef()); + qualBuilder.append(quals.charAt(j++)); + break; + case '$': // end of read + break; + case '*': // end of indel? + j++; + break; + case '^': // mapping quality + i++; + break; + case '+': // start of indel + case '-': // start of indel + final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 + final String rest = bases.substring(i+1); + //System.out.printf("sub is %s%n", rest); + Matcher match = regex.matcher(rest); + if ( ! match.matches() ) { + if ( feature.getRef() != '*' ) + throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); + done = true; + } + else { + String g = match.group(1); + //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); + int l = Integer.parseInt(g); + i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) + //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); + } + break; + default: // non reference base + baseBuilder.append(c); + qualBuilder.append(quals.charAt(j++)); + } + } + + feature.setPileupBases(baseBuilder.toString()); + feature.setPileupQuals(qualBuilder.toString()); + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java new file mode 100644 index 000000000..6b7257913 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.features.sampileup; + +import org.broad.tribble.Feature; + +import java.util.List; + +import net.sf.samtools.util.StringUtil; + +/** + * A tribble feature representing a SAM pileup. + * + * @author mhanna + * @version 0.1 + */ +public class SAMPileupFeature implements Feature { + public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; + + private String contig; // genomic location of this genotyped site + private int start; + private int stop; + + private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) + private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions + + private String pileupQuals; // the read base qualities + private String pileupBases; // the read bases themselves + + private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion + private VariantType varType = VariantType.NONE; + private int nNonref = 0; // number of non-reference alleles observed + private int eventLength = 0; // number of inserted or deleted bases + + private double consensusScore = 0; + private double variantScore = 0; + + /** + * create the dbSNP feature, given the following information: + * + * @param contig the contig rsID + * @param start the start position, one based + * @param stop the stop position, one based + */ + SAMPileupFeature(String contig, + int start, + int stop) { + this.contig = contig; + this.start = start; + this.stop = stop; + } + + public String getChr() { + return contig; + } + + protected void setChr(String chr) { + this.contig = chr; + } + + public int getStart() { + return start; + } + + protected void setStart(int start) { + this.start = start; + } + + public int getEnd() { + return stop; + } + + protected void setEnd(int end) { + this.stop = end; + } + + public String getQualsAsString() { return pileupQuals; } + + protected void setPileupQuals(String pileupQuals) { + this.pileupQuals = pileupQuals; + } + + /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. + * + */ + public char getRef() { return refBaseChar; } + + protected void setRef(char ref) { + this.refBaseChar = ref; + } + + public int size() { return pileupQuals.length(); } + + /** Returns pile of observed bases over the current genomic location. + * + */ + public String getBasesAsString() { return pileupBases; } + + protected void setPileupBases(String pileupBases) { + this.pileupBases = pileupBases; + } + + /** Returns formatted pileup string for the current genomic location as + * "location: reference_base observed_base_pile observed_qual_pile" + */ + public String getPileupString() + { + if(start == stop) + return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString()); + else + return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString()); + } + + /** + * Gets the bases in byte array form. + * @return byte array of the available bases. + */ + public byte[] getBases() { + return StringUtil.stringToBytes(getBasesAsString()); + } + + /** + * Gets the Phred base qualities without ASCII offset. + * @return Phred base qualities. + */ + public byte[] getQuals() { + byte[] quals = StringUtil.stringToBytes(getQualsAsString()); + for(int i = 0; i < quals.length; i++) quals[i] -= 33; + return quals; + } + + /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single + * character (reference base). For indel genotypes, the string is empty for insertions into + * the reference, or consists of deleted bases for deletions. + * + * @return reference allele, forward strand + */ + public String getFWDRefBases() { + return refBases; + } + + protected void setRefBases(String refBases) { + this.refBases = refBases; + } + + public List getFWDAlleles() { + return observedAlleles; + } + + protected void setFWDAlleles(List alleles) { + this.observedAlleles = alleles; + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType == VariantType.SNP; } + public boolean isInsertion() { return varType == VariantType.INSERTION; } + public boolean isDeletion() { return varType == VariantType.DELETION ; } + public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; } + public boolean isReference() { return varType == VariantType.NONE; } + + protected void setVariantType(VariantType variantType) { + this.varType = variantType; + } + + public boolean isHom() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) ); + return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); + } + + public boolean isHet() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) ); + return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) ); + } + + public double getVariantConfidence() { + return variantScore; + } + + protected void setVariantConfidence(double variantScore) { + this.variantScore = variantScore; + } + + public boolean isBiallelic() { + return nNonref < 2; + } + + protected void setNumNonRef(int nNonref) { + this.nNonref = nNonref; + } + + public double getConsensusConfidence() { + return consensusScore; + } + + protected void setConsensusConfidence(double consensusScore) { + this.consensusScore = consensusScore; + } + + public int length() { + return eventLength; + } + + protected void setLength(int eventLength) { + this.eventLength = eventLength; + } + + public boolean isIndelGenotype() { + return refBaseChar == '*'; + } + + + public boolean isPointGenotype() { + return ! isIndelGenotype(); + } + + /** Implements method required by GenotypeList interface. If this object represents + * an indel genotype, then it returns itself through this method. If this object is a + * point genotype, this method returns null. + * @return + */ + public SAMPileupFeature getIndelGenotype() { + if ( isIndelGenotype() ) return this; + else return null; + } + + /** Implements method required by GenotypeList interface. If this object represents + * a point genotype, then it returns itself through this method. If this object is an + * indel genotype, this method returns null. + * @return + */ + public SAMPileupFeature getPointGenotype() { + if ( isPointGenotype() ) return this; + else return null; + } + + /** Returns true if this object \em is an indel genotype (and thus + * indel genotype is what it only has). + * @return + */ + public boolean hasIndelGenotype() { + return isIndelGenotype(); + } + + /** Returns true if this object \em is a point genotype (and thus + * point genotype is what it only has. + * @return + */ + public boolean hasPointGenotype() { + return isPointGenotype(); + } + + + +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java deleted file mode 100644 index 8ab9c7f88..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java +++ /dev/null @@ -1,225 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - - -import java.io.IOException; -import java.util.*; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import net.sf.picard.reference.ReferenceSequenceFileWalker; - -/** - * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
- * - * Example format:
- * for SNP:
- * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
- * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
- *
- * for indel:
- * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
- * User: asivache - * Date: Apr 03, 2009 - * Time: 2:58:33 PM - * To change this template use File | Settings | File Templates. - */ - - - -public class rodSAMPileup extends BasicReferenceOrderedDatum { - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - - private SAMPileupRecord indelGenotype = null; - private SAMPileupRecord pointGenotype = null; - - public rodSAMPileup(final String name) { - super(name); - } - - @Override - public GenomeLoc getLocation() { - if ( pointGenotype != null ) return pointGenotype.getLocation(); - if ( indelGenotype != null ) return indelGenotype.getLocation(); - return null; - } - - /** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator, - * so this method does nothing at all (always returns false). - * - */ - @Override - public boolean parseLine(Object header, String[] parts) throws IOException { - return false; - } - - @Override - public String toString() { - StringBuilder b = new StringBuilder(); - if ( pointGenotype != null ) { - b.append(pointGenotype.toString()); - if ( indelGenotype != null ) b.append('\n'); - } - if ( indelGenotype != null ) b.append(indelGenotype.toString()); - return b.toString(); - } - - public SAMPileupRecord getIndelGenotype() { - return indelGenotype; - } - - public SAMPileupRecord getPointGenotype() { - return pointGenotype; - } - - public boolean hasIndelGenotype() { - return indelGenotype != null; - } - - public boolean hasPointGenotype() { - return pointGenotype != null; - } - - /** Iterates over SAM pileup multiline records: each new invocation of next() will - * move to next genomic position in the pileup file, and if the record for that position - * consists of multiple lines (as is the case for indels), they all will be read and processed. - * @author asivache - * - */ - private static class rodSAMPileupIterator implements Iterator { - //private xReadLines parser = null; - private String rodName = null; - private PushbackIterator parser = null; - private long line = 0; - - rodSAMPileupIterator(String name, java.io.File f) { - parser = new PushbackIterator(SAMPileupRecord.createIterator(name,f)); - rodName = name; - } - - public boolean hasNext() { - return parser.hasNext(); - } - - /** - * @return the next element in the iteration. - * @throws NoSuchElementException - iterator has no more elements. - */ - public rodSAMPileup next() { - if (!this.hasNext()) throw new NoSuchElementException("rodSAMPileup next called on iterator with no more elements"); - rodSAMPileup result = new rodSAMPileup(rodName); - - SAMPileupRecord r = parser.next(); - - // if ( (++line)%10000 == 0 ) System.out.printf("%s: record is %d (%s)%n", rodName, line,r.getLocation().toString()); - - if ( r.isPointGenotype() ) result.pointGenotype = r; - else result.indelGenotype = r; - - if ( parser.hasNext() ) { - - SAMPileupRecord r2 = parser.next(); - int cmp = r.getLocation().compareTo(r2.getLocation()); - switch ( cmp ) { - case -1 : // next record is at greater position - parser.pushback(r2); // return next record back to the stream - break; - case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel - if ( result.indelGenotype != null ) { // oops, we've seen indel already - if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() ); - else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() ); - } else { - // ok, the first genotype we've seen was a point one - if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() ); - // and the second is an indel, wheww... - result.indelGenotype = r2; - } - - // the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the - // the same genomic location (point and indel variants), there can not be any more: - if ( parser.hasNext() ) { - r2 = parser.next(); - cmp = r.getLocation().compareTo(r2.getLocation() ) ; - if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation()); - if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " - + r.getLocation() ); - parser.pushback(r2); - } - - break; - case 1: // next record is at lower position?? come'on! - throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() ); - default: - throw new StingException("INTERNAL ERROR: this point should never be reached"); - } - } - return result ; - } - - - public void remove() { - throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups"); - } - - } - - public static Iterator createIterator(String name, java.io.File file) { - return new rodSAMPileup.rodSAMPileupIterator(name,file); - } - - public static void main(String argv[]) { - String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup"; -// String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel"; - - Iterator it = createIterator("test-normal", new java.io.File(testFile)); - - ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( - new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") -// new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta") - ); - - if ( reference.getSequenceDictionary() == null ) { - System.out.println("No reference sequence dictionary found. Abort."); - System.exit(1); - } - - GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary()); - - int counter = 0; - - while ( it.hasNext() && counter < 430 ) { - rodSAMPileup p = it.next(); - System.out.println(p.toString()); -/* - System.out.print(p.getLocation().toString()); - System.out.print('\t'); - - if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); } - else { - if ( p.isSNP() ) { System.out.print("SNP"); } - else { - if ( p.isIndel() ) { System.out.print("Indel"); } - else { System.out.print("REF"); } - } - } - - System.out.print('\t'); - System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1)); - System.out.print('\t'); - System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence()); - */ - counter++; - } - } - - -} - - diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java index 9e9c898aa..0a6f35d16 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java @@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder { static { // All known ROD types - Types.put("SAMPileup", rodSAMPileup.class); Types.put("GELI", rodGELI.class); Types.put("RefSeq", rodRefSeq.class); Types.put("Table", TabularROD.class); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java index 2a65a7d54..49d5ae586 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java @@ -28,11 +28,12 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SAMPileupRecord; -import org.broadinstitute.sting.gatk.refdata.rodSAMPileup; +import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -43,14 +44,14 @@ import java.util.Arrays; * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data * should be specified using the command-line argument '-B pileup,SAMPileup,'. */ -@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=rodSAMPileup.class)) +@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=SAMPileupFeature.class)) public class ValidatingPileupWalker extends LocusWalker implements TreeReducible { @Argument(fullName="continue_after_error",doc="Continue after an error",required=false) public boolean CONTINUE_AFTER_AN_ERROR = false; public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { ReadBackedPileup pileup = context.getBasePileup(); - SAMPileupRecord truePileup = getTruePileup( tracker ); + SAMPileupFeature truePileup = getTruePileup( tracker ); if ( truePileup == null ) { out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBase())); @@ -82,11 +83,12 @@ public class ValidatingPileupWalker extends LocusWalker