ALERT, ALERT! rodSAMPileup is now a GenotypeList, not a Genotype! Now it can intelligently read full samtools pileup files (containing, in general, both point and indel genotypes at the same position). No need to split/synchronize pileups from different individuals anymore, hooray!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@797 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
26633957d9
commit
732fed9aad
|
|
@ -1,13 +1,14 @@
|
||||||
package org.broadinstitute.sting.gatk.refdata;
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.Pileup;
|
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
|
* This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
|
||||||
|
|
@ -18,404 +19,207 @@ import org.broadinstitute.sting.utils.Pileup;
|
||||||
* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) <br>
|
* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) <br>
|
||||||
* <br>
|
* <br>
|
||||||
* for indel: <br>
|
* for indel: <br>
|
||||||
* [chr] [pos] [always *] [alleles] [?] [?] [?] [num reads in the pile] ... <br>
|
* [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]<br>
|
||||||
* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 <br>
|
* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 <br>
|
||||||
* User: asivache
|
* User: asivache
|
||||||
* Date: Apr 03, 2009
|
* Date: Apr 03, 2009
|
||||||
* Time: 2:58:33 PM
|
* Time: 2:58:33 PM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup {
|
|
||||||
private static final int NO_VARIANT = -1;
|
|
||||||
private static final int SNP_VARIANT = 0;
|
|
||||||
private static final int INSERTION_VARIANT = 1;
|
|
||||||
private static final int DELETION_VARIANT = 2;
|
|
||||||
private static final int INDEL_VARIANT = 3;
|
|
||||||
|
|
||||||
// allocate once and don't ever bother creating them again:
|
|
||||||
private static final String baseA = new String("A");
|
|
||||||
private static final String baseC = new String("C");
|
|
||||||
private static final String baseG = new String("G");
|
|
||||||
private static final String baseT = new String("T");
|
|
||||||
private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
|
|
||||||
|
|
||||||
protected GenomeLoc loc; // genomic location of this genotyped site
|
|
||||||
// Reference sequence chromosome or scaffold
|
|
||||||
// Start and stop positions in chrom
|
|
||||||
|
|
||||||
|
|
||||||
protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
|
|
||||||
protected String refBases; // the reference base sequence according to NCBI
|
|
||||||
protected String observedString; // store the actual string representation of observed alleles
|
|
||||||
|
|
||||||
protected String pileupQuals; // the read base qualities
|
public class rodSAMPileup extends BasicReferenceOrderedDatum implements GenotypeList {
|
||||||
protected String pileupBases; // the read bases themselves
|
|
||||||
|
|
||||||
protected List<String> observedAlleles = null; // The sequences of the observed alleles from rs-fasta files
|
|
||||||
protected int varType = NO_VARIANT;
|
|
||||||
protected int ploidy = 2; // how many allelic variants we observe?
|
|
||||||
protected int nNonref = 0; // number of non-reference alleles
|
|
||||||
protected int eventLength = 0;
|
|
||||||
|
|
||||||
protected double consensusScore;
|
|
||||||
protected double variantScore;
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Constructors
|
// Constructors
|
||||||
//
|
//
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
private SAMPileupRecord indelGenotype = null;
|
||||||
|
private SAMPileupRecord pointGenotype = null;
|
||||||
|
|
||||||
public rodSAMPileup(final String name) {
|
public rodSAMPileup(final String name) {
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean parseLine(final Object header, final String[] parts) {
|
public GenomeLoc getLocation() {
|
||||||
// 0 1 2 3 4 5 6 7
|
if ( pointGenotype != null ) return pointGenotype.getLocation();
|
||||||
// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
|
if ( indelGenotype != null ) return indelGenotype.getLocation();
|
||||||
// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
|
|
||||||
try {
|
|
||||||
|
|
||||||
String contig = parts[0];
|
|
||||||
long start = Long.parseLong(parts[1]) ;
|
|
||||||
|
|
||||||
consensusScore = Double.parseDouble(parts[4]);
|
|
||||||
variantScore = Double.parseDouble(parts[5]);
|
|
||||||
|
|
||||||
refBaseChar = Character.toUpperCase(parts[2].charAt(0));
|
|
||||||
|
|
||||||
parts[3] = parts[3].toUpperCase();
|
|
||||||
observedString = parts[3];
|
|
||||||
|
|
||||||
parseBasesAndQuals(parts[8], parts[9]);
|
|
||||||
|
|
||||||
observedAlleles = new ArrayList<String>(2);
|
|
||||||
|
|
||||||
if ( refBaseChar == '*' ) {
|
|
||||||
parseIndels(parts[3]) ;
|
|
||||||
if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
|
|
||||||
else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// if the variant is a SNP or a reference base (i.e. no variant at all)
|
|
||||||
assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter";
|
|
||||||
refBases = parts[2].toUpperCase();
|
|
||||||
eventLength = 1;
|
|
||||||
//loc = new GenomeLoc(contig, start, start+1);
|
|
||||||
loc = new GenomeLoc(contig, start, start);
|
|
||||||
|
|
||||||
char ch = parts[3].charAt(0);
|
|
||||||
|
|
||||||
switch ( ch ) {
|
|
||||||
case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
|
|
||||||
case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
|
|
||||||
case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
|
|
||||||
case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
|
|
||||||
case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
|
|
||||||
case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
|
|
||||||
case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
|
|
||||||
case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
|
|
||||||
case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
|
|
||||||
case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
|
|
||||||
}
|
|
||||||
if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
|
|
||||||
else {
|
|
||||||
// we know that at least one allele is non-ref;
|
|
||||||
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
|
|
||||||
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
|
|
||||||
varType = SNP_VARIANT;
|
|
||||||
if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
|
|
||||||
observedAlleles.get(1).charAt(0) == refBaseChar ||
|
|
||||||
observedAlleles.get(0) == observedAlleles.get(1)
|
|
||||||
) nNonref = 1;
|
|
||||||
else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch ( RuntimeException e ) {
|
|
||||||
System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
// if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void parseIndels(String genotype) {
|
|
||||||
String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
|
|
||||||
|
|
||||||
// if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
|
|
||||||
// if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
|
|
||||||
// from what we have recorded for the deletion allele (e.g. "-CAC")
|
|
||||||
boolean hasRefAllele = false;
|
|
||||||
|
|
||||||
for ( int i = 0 ; i < obs.length ; i++ ) {
|
|
||||||
if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
|
|
||||||
hasRefAllele = true;
|
|
||||||
continue; // we will fill reference allele later
|
|
||||||
}
|
|
||||||
|
|
||||||
String varBases = obs[i].toUpperCase();
|
|
||||||
|
|
||||||
switch ( obs[i].charAt(0) ) {
|
|
||||||
case '+':
|
|
||||||
if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
|
|
||||||
else varType = INSERTION_VARIANT;
|
|
||||||
refBases = emptyStr;
|
|
||||||
break;
|
|
||||||
case '-' :
|
|
||||||
if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
|
|
||||||
else varType = DELETION_VARIANT;
|
|
||||||
refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
|
|
||||||
break;
|
|
||||||
default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
|
|
||||||
}
|
|
||||||
observedAlleles.add(varBases);
|
|
||||||
eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
|
|
||||||
}
|
|
||||||
if ( hasRefAllele ) {
|
|
||||||
// we got at least one ref. allele (out of two recorded)
|
|
||||||
if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
|
|
||||||
observedAlleles.add(emptyStr);
|
|
||||||
observedAlleles.add(emptyStr); // no inserted/deleted bases at the site!
|
|
||||||
nNonref = 0; // no observations of non-reference allele at all
|
|
||||||
} else {
|
|
||||||
nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
|
|
||||||
|
|
||||||
// whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty;
|
|
||||||
// the one that contain actual bases (alt for insertion or ref for deletion) was already filled above:
|
|
||||||
observedAlleles.add(emptyStr);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
|
|
||||||
// fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions
|
|
||||||
// or both deletions as compared to +ACC/-ACC which would still have the same bases
|
|
||||||
if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1;
|
|
||||||
else nNonref = 2;
|
|
||||||
}
|
|
||||||
// DONE with indels
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private void parseBasesAndQuals(final String bases, final String quals)
|
|
||||||
{
|
|
||||||
//System.out.printf("%s%n%s%n", bases, quals);
|
|
||||||
|
|
||||||
// needs to convert the base string with it's . and , to the ref base
|
|
||||||
StringBuilder baseBuilder = new StringBuilder();
|
|
||||||
StringBuilder qualBuilder = new StringBuilder();
|
|
||||||
boolean done = false;
|
|
||||||
for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
|
|
||||||
//System.out.printf("%d %d%n", i, j);
|
|
||||||
char c = (char)bases.charAt(i);
|
|
||||||
|
|
||||||
switch ( c ) {
|
|
||||||
case '.': // matches reference
|
|
||||||
case ',': // matches reference
|
|
||||||
baseBuilder.append(refBaseChar);
|
|
||||||
qualBuilder.append((char)quals.charAt(j++));
|
|
||||||
break;
|
|
||||||
case '$': // end of read
|
|
||||||
break;
|
|
||||||
case '*': // end of indel?
|
|
||||||
j++;
|
|
||||||
break;
|
|
||||||
case '^': // mapping quality
|
|
||||||
i++;
|
|
||||||
break;
|
|
||||||
case '+': // start of indel
|
|
||||||
case '-': // start of indel
|
|
||||||
final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
|
|
||||||
final String rest = bases.substring(i+1);
|
|
||||||
//System.out.printf("sub is %s%n", rest);
|
|
||||||
Matcher match = regex.matcher(rest);
|
|
||||||
if ( ! match.matches() ) {
|
|
||||||
if ( refBaseChar != '*' )
|
|
||||||
throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
|
|
||||||
done = true;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
String g = match.group(1);
|
|
||||||
//System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
|
|
||||||
int l = Integer.parseInt(g);
|
|
||||||
i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
|
|
||||||
//System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default: // non reference base
|
|
||||||
baseBuilder.append(c);
|
|
||||||
qualBuilder.append((char)quals.charAt(j++));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pileupBases = baseBuilder.toString();
|
|
||||||
pileupQuals = qualBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public GenomeLoc getLocation() { return loc; }
|
|
||||||
public String getQuals() { return pileupQuals; }
|
|
||||||
public char getRef() { return refBaseChar; }
|
|
||||||
public int size() { return pileupQuals.length(); }
|
|
||||||
public String getBases() { return pileupBases; }
|
|
||||||
|
|
||||||
public String getPileupString()
|
|
||||||
{
|
|
||||||
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Returns bases in the reference allele as a String. String can be empty (as in insertion into
|
|
||||||
* the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
|
|
||||||
* (for longer indels).
|
|
||||||
*
|
|
||||||
* @return reference allele, forward strand
|
|
||||||
*/
|
|
||||||
public String getRefBasesFWD() {
|
|
||||||
return refBases;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException
|
|
||||||
* if variant is not a SNP.
|
|
||||||
*
|
|
||||||
* @return reference base on the forward strand
|
|
||||||
*/
|
|
||||||
/* public char getRefSnpFWD() throws IllegalStateException {
|
|
||||||
if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP");
|
|
||||||
return refBaseChar;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> getFWDAlleles() {
|
|
||||||
return observedAlleles;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// What kind of variant are we?
|
|
||||||
//
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
public boolean isSNP() { return varType == SNP_VARIANT ; }
|
|
||||||
public boolean isInsertion() { return varType == INSERTION_VARIANT; }
|
|
||||||
public boolean isDeletion() { return varType == DELETION_VARIANT ; }
|
|
||||||
public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; }
|
|
||||||
public boolean isReference() { return varType == NO_VARIANT; }
|
|
||||||
|
|
||||||
public boolean isHom() {
|
|
||||||
// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
|
|
||||||
if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) );
|
|
||||||
return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isHet() {
|
|
||||||
// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
|
|
||||||
if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) );
|
|
||||||
return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// formatting
|
|
||||||
//
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
public String toString() {
|
|
||||||
return String.format("%s\t%d\t%d\t%s\t%s\t%s",
|
|
||||||
getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toSimpleString() {
|
|
||||||
return String.format("%s:%s", name, observedString);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toMediumString() {
|
|
||||||
|
|
||||||
String s = null;
|
|
||||||
if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
|
|
||||||
else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
|
|
||||||
if ( isSNP() ) s += ": SNP";
|
|
||||||
else {
|
|
||||||
if ( isInsertion() ) s += ": Insertion";
|
|
||||||
else {
|
|
||||||
if ( isDeletion() ) s+= ": Deletion";
|
|
||||||
else {
|
|
||||||
if ( isIndel() ) s+=": Indel";
|
|
||||||
else s+=": Reference";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String repl() {
|
|
||||||
return String.format("REPL not implemented yet");
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
@Override
|
|
||||||
public String getAltBasesFWD() {
|
|
||||||
if ( ! isSNP() && ! isIndel() ) return emptyStr;
|
|
||||||
|
|
||||||
if ( isSNP() ) {
|
|
||||||
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1);
|
|
||||||
else return observedAlleles.get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( isInsertion() ) {
|
|
||||||
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1);
|
|
||||||
else return observedAlleles.get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( isDeletion() ) {
|
|
||||||
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0);
|
|
||||||
else return observedAlleles.get(1);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString());
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
/** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator,
|
||||||
|
* so this method does nothing at all (always returns false).
|
||||||
|
*
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public char getAltSnpFWD() throws IllegalStateException {
|
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||||
if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP");
|
return false;
|
||||||
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0);
|
|
||||||
else return observedAlleles.get(0).charAt(0);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double getConsensusConfidence() {
|
public String toString() {
|
||||||
return consensusScore;
|
StringBuilder b = new StringBuilder();
|
||||||
|
if ( pointGenotype != null ) {
|
||||||
|
b.append(pointGenotype.toString());
|
||||||
|
if ( indelGenotype != null ) b.append('\n');
|
||||||
}
|
}
|
||||||
*/
|
if ( indelGenotype != null ) b.append(indelGenotype.toString());
|
||||||
|
return b.toString();
|
||||||
/*
|
|
||||||
@Override
|
|
||||||
public int getPloidy() throws IllegalStateException {
|
|
||||||
return 2; // ???
|
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double getVariantConfidence() {
|
public Genotype getIndelGenotype() {
|
||||||
return variantScore;
|
return indelGenotype;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Genotype getPointGenotype() {
|
||||||
|
return pointGenotype;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasIndelGenotype() {
|
||||||
|
return indelGenotype != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasPointGenotype() {
|
||||||
|
return pointGenotype != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Iterates over SAM pileup multiline records: each new invocation of next() will
|
||||||
|
* move to next genomic position in the pileup file, and if the record for that position
|
||||||
|
* consists of multiple lines (as is the case for indels), they all will be read and processed.
|
||||||
|
* @author asivache
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private static class rodSAMPileupIterator implements Iterator<rodSAMPileup> {
|
||||||
|
//private xReadLines parser = null;
|
||||||
|
private String rodName = null;
|
||||||
|
private PushbackIterator<SAMPileupRecord> parser = null;
|
||||||
|
|
||||||
|
rodSAMPileupIterator(String name, java.io.File f) {
|
||||||
|
parser = new PushbackIterator<SAMPileupRecord>(SAMPileupRecord.createIterator(name,f));
|
||||||
|
rodName = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
return parser.hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public rodSAMPileup next() {
|
||||||
|
|
||||||
|
rodSAMPileup result = new rodSAMPileup(rodName);
|
||||||
|
|
||||||
|
SAMPileupRecord r = parser.next();
|
||||||
|
//System.out.printf("Line is %s%n", line);
|
||||||
|
|
||||||
|
if ( r.isPointGenotype() ) result.pointGenotype = r;
|
||||||
|
else result.indelGenotype = r;
|
||||||
|
|
||||||
|
if ( parser.hasNext() ) {
|
||||||
|
|
||||||
|
SAMPileupRecord r2 = parser.next();
|
||||||
|
int cmp = r.getLocation().compareTo(r2.getLocation());
|
||||||
|
switch ( cmp ) {
|
||||||
|
case -1 : // next record is at greater position
|
||||||
|
parser.pushback(r2); // return next record back to the stream
|
||||||
|
break;
|
||||||
|
case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel
|
||||||
|
if ( result.indelGenotype != null ) { // oops, we've seen indel already
|
||||||
|
if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() );
|
||||||
|
else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() );
|
||||||
|
} else {
|
||||||
|
// ok, the first genotype we've seen was a point one
|
||||||
|
if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() );
|
||||||
|
// and the second is an indel, wheww...
|
||||||
|
result.indelGenotype = r2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the
|
||||||
|
// the same genomic location (point and indel variants), there can not be any more:
|
||||||
|
if ( parser.hasNext() ) {
|
||||||
|
r2 = parser.next();
|
||||||
|
cmp = r.getLocation().compareTo(r2.getLocation() ) ;
|
||||||
|
if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation());
|
||||||
|
if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position "
|
||||||
|
+ r.getLocation() );
|
||||||
|
parser.pushback(r2);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
case 1: // next record is at lower position?? come'on!
|
||||||
|
throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() );
|
||||||
|
default:
|
||||||
|
throw new StingException("INTERNAL ERROR: this point should never be reached");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isBiallelic() {
|
public void remove() {
|
||||||
return nNonref < 2;
|
throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public double getConsensusConfidence() {
|
|
||||||
return consensusScore;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public static Iterator<rodSAMPileup> createIterator(String name, java.io.File file) {
|
||||||
public String getFWDRefBases() {
|
return new rodSAMPileup.rodSAMPileupIterator(name,file);
|
||||||
return refBases;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String argv[]) {
|
||||||
|
String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
|
||||||
|
// String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
|
||||||
|
|
||||||
|
Iterator<rodSAMPileup> it = createIterator("test-normal", new java.io.File(testFile));
|
||||||
|
|
||||||
|
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
|
||||||
|
new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
|
||||||
|
// new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
|
||||||
|
);
|
||||||
|
|
||||||
|
if ( reference.getSequenceDictionary() == null ) {
|
||||||
|
System.out.println("No reference sequence dictionary found. Abort.");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary());
|
||||||
|
|
||||||
|
int counter = 0;
|
||||||
|
|
||||||
|
while ( it.hasNext() && counter < 430 ) {
|
||||||
|
rodSAMPileup p = it.next();
|
||||||
|
System.out.println(p.toString());
|
||||||
|
/*
|
||||||
|
System.out.print(p.getLocation().toString());
|
||||||
|
System.out.print('\t');
|
||||||
|
|
||||||
|
if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
|
||||||
|
else {
|
||||||
|
if ( p.isSNP() ) { System.out.print("SNP"); }
|
||||||
|
else {
|
||||||
|
if ( p.isIndel() ) { System.out.print("Indel"); }
|
||||||
|
else { System.out.print("REF"); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.print('\t');
|
||||||
|
System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
|
||||||
|
System.out.print('\t');
|
||||||
|
System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
|
||||||
|
*/
|
||||||
|
counter++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue