General purpose pileup code -- you can use these features to obtain detailed pileup data from reads and offsets. Useful for all pileup based walkers. Expanded support for rodSAMPileup to enable the new ValidatingPileupWalker, which takes a samtools pileup output and checks that GATK gives identical output as samtools on a per base and per qual pileup. It's going to be a very useful validation tool.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@418 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-14 22:13:10 +00:00
parent baae98c6d5
commit 72a3d84ed2
11 changed files with 650 additions and 328 deletions

View File

@ -176,8 +176,12 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Required main method implementation. * Required main method implementation.
*/ */
public static void main(String[] argv) { public static void main(String[] argv) {
try {
Instance = new GenomeAnalysisTK(); Instance = new GenomeAnalysisTK();
start(Instance, argv); start(Instance, argv);
} catch ( Exception e ) {
exitSystemWithError(e);
}
} }
/** /**
@ -195,6 +199,30 @@ public class GenomeAnalysisTK extends CommandLineProgram {
ROD_BINDINGS.add(file); ROD_BINDINGS.add(file);
} }
private static void printExitSystemMsg(final String msg) {
System.out.printf("------------------------------------------------------------------------------------------%n");
System.out.printf("An error has occurred%n");
System.out.printf("It's possible (maybe even likely) that this is an input error on your part%n");
System.out.printf("But if it's a bug or something that should work, please report this to gsadevelopers@broad.mit.edu%n");
System.out.printf("%n");
System.out.printf("%s%n", msg);
}
public static void exitSystemWithError(final String msg) {
printExitSystemMsg(msg);
System.exit(1);
}
public static void exitSystemWithError(final String msg, Exception e ) {
e.printStackTrace();
printExitSystemMsg(msg);
System.exit(1);
}
public static void exitSystemWithError(Exception e ) {
exitSystemWithError(e.getMessage(), e);
}
protected int execute() { protected int execute() {
final boolean TEST_ROD = false; final boolean TEST_ROD = false;
List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >(); List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >();

View File

@ -2,11 +2,12 @@ package org.broadinstitute.sting.gatk.refdata;
import java.util.*; import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant; import org.broadinstitute.sting.utils.Pileup;
/** /**
* This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br> * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
@ -24,7 +25,7 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
* Time: 2:58:33 PM * Time: 2:58:33 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant { public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant, Pileup {
private static final int NO_VARIANT = -1; private static final int NO_VARIANT = -1;
private static final int SNP_VARIANT = 0; private static final int SNP_VARIANT = 0;
private static final int INSERTION_VARIANT = 1; private static final int INSERTION_VARIANT = 1;
@ -47,6 +48,9 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
protected String refBases; // the reference base according to NCBI, in the dbSNP file protected String refBases; // the reference base according to NCBI, in the dbSNP file
protected String observedString; // store the actual string representation of observed alleles protected String observedString; // store the actual string representation of observed alleles
protected String pileupQuals; // the read base qualities
protected String pileupBases; // the read bases themselves
protected List<String> observedAlleles = null; // The sequences of the observed alleles from rs-fasta files protected List<String> observedAlleles = null; // The sequences of the observed alleles from rs-fasta files
protected int varType = NO_VARIANT; protected int varType = NO_VARIANT;
protected int ploidy = 2; // how many allelic variants we get? protected int ploidy = 2; // how many allelic variants we get?
@ -82,6 +86,8 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
parts[3] = parts[3].toUpperCase(); parts[3] = parts[3].toUpperCase();
observedString = parts[3]; observedString = parts[3];
parseBasesAndQuals(parts[8], parts[9]);
observedAlleles = new ArrayList<String>(2); observedAlleles = new ArrayList<String>(2);
if ( refBaseChar == '*' ) { if ( refBaseChar == '*' ) {
@ -94,7 +100,8 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter";
refBases = parts[2].toUpperCase(); refBases = parts[2].toUpperCase();
eventLength = 1; eventLength = 1;
loc = new GenomeLoc(contig, start, start+1); //loc = new GenomeLoc(contig, start, start+1);
loc = new GenomeLoc(contig, start, start);
char ch = parts[3].charAt(0); char ch = parts[3].charAt(0);
@ -124,7 +131,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
} }
} }
} catch ( RuntimeException e ) { } catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts)); System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
throw e; throw e;
} }
if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
@ -188,8 +195,72 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
} }
private void parseBasesAndQuals(final String bases, final String quals)
{
//System.out.printf("%s%n%s%n", bases, quals);
// needs to convert the base string with it's . and , to the ref base
StringBuilder baseBuilder = new StringBuilder();
StringBuilder qualBuilder = new StringBuilder();
boolean done = false;
for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
//System.out.printf("%d %d%n", i, j);
char c = (char)bases.charAt(i);
switch ( c ) {
case '.': // matches reference
case ',': // matches reference
baseBuilder.append(refBaseChar);
qualBuilder.append((char)quals.charAt(j++));
break;
case '$': // end of read
break;
case '*': // end of indel?
j++;
break;
case '^': // mapping quality
i++;
break;
case '+': // start of indel
case '-': // start of indel
final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
final String rest = bases.substring(i+1);
//System.out.printf("sub is %s%n", rest);
Matcher match = regex.matcher(rest);
if ( ! match.matches() ) {
if ( refBaseChar != '*' )
throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
done = true;
}
else {
String g = match.group(1);
//System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
int l = Integer.parseInt(g);
i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
//System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
}
break;
default: // non reference base
baseBuilder.append(c);
qualBuilder.append((char)quals.charAt(j++));
}
}
pileupBases = baseBuilder.toString();
pileupQuals = qualBuilder.toString();
}
public GenomeLoc getLocation() { return loc; } public GenomeLoc getLocation() { return loc; }
public String getQuals() { return pileupQuals; }
public char getRef() { return refBaseChar; }
public int size() { return pileupQuals.length(); }
public String getBases() { return pileupBases; }
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
/** Returns bases in the reference allele as a String. String can be empty (as in insertion into /** Returns bases in the reference allele as a String. String can be empty (as in insertion into
* the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters

View File

@ -159,7 +159,7 @@ public class TraverseByLoci extends TraversalEngine {
sum = walker.reduce(x, sum); sum = walker.reduce(x, sum);
} }
printProgress("loci", locus.getLocation()); //printProgress("loci", locus.getLocation());
return sum; return sum;
} }
} }

View File

@ -6,6 +6,9 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pileup;
import org.broadinstitute.sting.utils.BasicPileup;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import java.util.List; import java.util.List;
@ -33,10 +36,8 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
} }
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
List<SAMRecord> reads = context.getReads(); ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context);
List<Integer> offsets = context.getOffsets(); String bases = pileup.getBases();
String bases = Utils.basePileupAsString(reads, offsets);
String quals = Utils.qualPileupAsString(reads, offsets);
if ( bases.equals("") && FLAG_UNCOVERED_BASES ) { if ( bases.equals("") && FLAG_UNCOVERED_BASES ) {
bases = "*** UNCOVERED SITE ***"; bases = "*** UNCOVERED SITE ***";
@ -44,9 +45,11 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
String extras = ""; String extras = "";
if ( VERBOSE ) { if ( VERBOSE ) {
String sqbases = Utils.secondaryBasePileupAsString(reads, offsets); extras += " BQ=" + pileup.getQualsAsInts();
String sqquals = Utils.secondaryQualPileupAsString(reads, offsets); extras += " MQ=" + pileup.getMappingQualsAsInts();
String sqbases = pileup.getSecondaryBasePileup();
String sqquals = pileup.getSecondaryQualPileup();
if (sqbases != null && sqquals != null) { if (sqbases != null && sqquals != null) {
assert(sqbases.length() == sqquals.length()); assert(sqbases.length() == sqquals.length());
@ -60,17 +63,17 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
} }
} }
} }
extras += " BQ=" + Utils.join(",", Utils.qualPileup(reads, offsets));
extras += " MQ=" + Utils.join(",", Utils.mappingQualPileup(reads));
} }
String rodString = ""; String rodString = "";
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
if ( datum != null && ! (datum instanceof rodDbSNP)) { if ( datum != null && ! (datum instanceof rodDbSNP)) {
//System.out.printf("rod = %s%n", datum.toSimpleString());
rodString += datum.toSimpleString(); rodString += datum.toSimpleString();
//System.out.printf("Rod string %s%n", rodString);
} }
} }
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
if ( dbsnp != null ) if ( dbsnp != null )
rodString += dbsnp.toMediumString(); rodString += dbsnp.toMediumString();
@ -79,7 +82,7 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
rodString = "[ROD: " + rodString + "]"; rodString = "[ROD: " + rodString + "]";
//if ( context.getLocation().getStart() % 1 == 0 ) { //if ( context.getLocation().getStart() % 1 == 0 ) {
out.printf("%s: %s %s %s%s %s%n", context.getLocation(), ref, bases, quals, extras, rodString); out.printf("%s%s %s%n", pileup.getPileupString(), extras, rodString);
//} //}
return 1; return 1;

View File

@ -0,0 +1,58 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BasicPileup;
import org.broadinstitute.sting.utils.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 3:22:14 PM
* To change this template use File | Settings | File Templates.
*/
public class ValidatingPileupWalker extends LocusWalker<Integer, ValidationStats> {
@Argument(fullName="verbose",required=false,defaultValue="false")
public boolean VERBOSE;
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context);
rodSAMPileup truePileup = (rodSAMPileup)tracker.lookup("pileup", null);
if ( truePileup == null )
Utils.scareUser(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
context.getLocation(), pileup.getBases()));
String pileupDiff = BasicPileup.pileupDiff(pileup, truePileup);
if ( pileupDiff != null ) {
out.printf("%s vs. %s%n", pileup.getPileupString(), truePileup.getPileupString());
throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
}
return pileup.size();
}
// Given result of map function
public ValidationStats reduceInit() { return new ValidationStats(); }
public ValidationStats reduce(Integer value, ValidationStats sum) {
sum.nLoci++;
sum.nBases += value;
return sum;
}
}
class ValidationStats {
public long nLoci = 0;
public long nBases = 0;
public ValidationStats() {
}
public String toString() {
return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases);
}
}

View File

@ -0,0 +1,178 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.LocusContext;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 8:54:05 AM
* To change this template use File | Settings | File Templates.
*/
abstract public class BasicPileup implements Pileup {
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
public static String basePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases = new StringBuilder();
for ( byte base : basePileup(reads, offsets)) {
bases.append((char)base);
}
return bases.toString();
}
public static ArrayList<Byte> basePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases.add(read.getReadBases()[offset]);
}
return bases;
}
public static ArrayList<Byte> qualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte qual = (byte)read.getBaseQualities()[offset];
quals.add(qual);
}
return quals;
}
public static ArrayList<Byte> mappingQualPileup( List<SAMRecord> reads) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
byte qual = (byte)read.getMappingQuality();
quals.add(qual);
}
return quals;
}
public static String qualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals = new StringBuilder();
for ( int qual : qualPileup(reads, offsets)) {
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
quals.append(qualChar);
}
return quals.toString();
}
public static ArrayList<Byte> secondaryBasePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte base2;
//byte qual2;
if (compressedQuals != null) {
base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset]));
//qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
base2 = (byte) '.';
}
bases2.add(base2);
}
return (hasAtLeastOneSQField ? bases2 : null);
}
public static String secondaryBasePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases2 = new StringBuilder();
ArrayList<Byte> sqbases = secondaryBasePileup(reads, offsets);
if (sqbases == null) { return null; }
for (byte base2 : secondaryBasePileup(reads, offsets)) {
bases2.append((char) base2);
}
return bases2.toString();
}
public static ArrayList<Byte> secondaryQualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte qual2;
if (compressedQuals != null) {
qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
qual2 = 0;
}
quals2.add(qual2);
}
return (hasAtLeastOneSQField ? quals2 : null);
}
public static String secondaryQualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals2 = new StringBuilder();
ArrayList<Byte> sqquals = secondaryQualPileup(reads, offsets);
if (sqquals == null) { return null; }
for (byte qual2 : secondaryQualPileup(reads, offsets)) {
quals2.append(qual2);
}
return quals2.toString();
}
public static String pileupDiff(final Pileup a, final Pileup b)
{
return pileupDiff(a,b,true);
}
private static String maybeSorted( final String x, boolean sortMe )
{
if ( sortMe ) {
byte[] bytes = x.getBytes();
Arrays.sort(bytes);
return new String(bytes);
}
else
return x;
}
public static String pileupDiff(final Pileup a, final Pileup b, boolean orderDependent)
{
if ( a.size() != b.size() )
return "Sizes not equal";
if ( a.getLocation().compareTo(b.getLocation()) != 0 )
return "Locations not equal";
String aBases = maybeSorted(a.getBases(), ! orderDependent );
String bBases = maybeSorted(b.getBases(), ! orderDependent );
if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) )
return "Bases not equal";
String aQuals = maybeSorted(a.getQuals(), ! orderDependent );
String bQuals = maybeSorted(b.getQuals(), ! orderDependent );
if ( ! aQuals.equals(bQuals) )
return "Quals not equal";
return null;
}
}

View File

@ -170,6 +170,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2
final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3 final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3
final Pattern regex4 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)\\+"); // matches case 4
String contig = null; String contig = null;
long start = 1; long start = 1;
@ -179,6 +180,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
Matcher match1 = regex1.matcher(str); Matcher match1 = regex1.matcher(str);
Matcher match2 = regex2.matcher(str); Matcher match2 = regex2.matcher(str);
Matcher match3 = regex3.matcher(str); Matcher match3 = regex3.matcher(str);
Matcher match4 = regex4.matcher(str);
try { try {
if ( match1.matches() ) { if ( match1.matches() ) {
@ -189,6 +191,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
start = parsePosition(match2.group(2)); start = parsePosition(match2.group(2));
stop = start; stop = start;
} }
else if ( match4.matches() ) {
contig = match4.group(1);
start = parsePosition(match4.group(2));
}
else if ( match3.matches() ) { else if ( match3.matches() ) {
contig = match3.group(1); contig = match3.group(1);
start = parsePosition(match3.group(2)); start = parsePosition(match3.group(2));

View File

@ -0,0 +1,17 @@
package org.broadinstitute.sting.utils;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 9:33:00 AM
* To change this template use File | Settings | File Templates.
*/
public interface Pileup {
GenomeLoc getLocation();
char getRef();
String getBases();
String getQuals();
String getPileupString();
int size();
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.LocusContext;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 8:54:05 AM
* To change this template use File | Settings | File Templates.
*/
public class ReadBackedPileup extends BasicPileup {
GenomeLoc loc;
char ref;
List<SAMRecord> reads;
List<Integer> offsets;
public ReadBackedPileup(GenomeLoc loc, char ref, LocusContext context ) {
this(loc, ref, context.getReads(), context.getOffsets());
}
public ReadBackedPileup(GenomeLoc loc, char ref, List<SAMRecord> reads, List<Integer> offsets ) {
assert reads != null;
assert offsets != null;
assert reads.size() == offsets.size();
this.loc = loc;
this.ref = ref;
this.reads = reads;
this.offsets = offsets;
}
public int size() { return reads.size(); }
public List<SAMRecord> getReads() { return reads; }
public List<Integer> getOffsets() { return offsets; }
public GenomeLoc getLocation() {
return loc;
}
public char getRef() {
return ref;
}
public String getBases() {
return basePileupAsString(reads, offsets);
}
public String getQuals() {
return qualPileupAsString(reads, offsets);
}
public String getQualsAsInts() {
return Utils.join(",", qualPileup(reads, offsets));
}
public String getMappingQualsAsInts() {
return Utils.join(",", mappingQualPileup(reads));
}
public String getSecondaryBasePileup() {
return secondaryBasePileupAsString(reads, offsets);
}
public String getSecondaryQualPileup() {
return secondaryQualPileupAsString(reads, offsets);
}
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
}

View File

@ -35,11 +35,11 @@ public class Utils {
} }
public static void scareUser(final String msg) { public static void scareUser(final String msg) {
System.out.printf("********************************************************************************%n"); //System.out.printf("********************************************************************************%n");
System.out.printf("* ERROR:%n"); //System.out.printf("* ERROR:%n");
System.out.printf("*%n"); //System.out.printf("*%n");
System.out.printf("* %s%n", msg); //System.out.printf("* %s%n", msg);
System.out.printf("********************************************************************************%n"); //System.out.printf("********************************************************************************%n");
logger.fatal(msg); logger.fatal(msg);
throw new RuntimeException(msg); throw new RuntimeException(msg);
} }
@ -119,126 +119,6 @@ public class Utils {
return c; return c;
} }
public static String basePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases = new StringBuilder();
for ( byte base : basePileup(reads, offsets)) {
bases.append((char)base);
}
return bases.toString();
}
public static ArrayList<Byte> basePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases.add(read.getReadBases()[offset]);
}
return bases;
}
public static ArrayList<Byte> qualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte qual = (byte)read.getBaseQualities()[offset];
quals.add(qual);
}
return quals;
}
public static ArrayList<Byte> mappingQualPileup( List<SAMRecord> reads) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
byte qual = (byte)read.getMappingQuality();
quals.add(qual);
}
return quals;
}
public static String qualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals = new StringBuilder();
for ( int qual : qualPileup(reads, offsets)) {
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
quals.append(qualChar);
}
return quals.toString();
}
public static ArrayList<Byte> secondaryBasePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte base2;
//byte qual2;
if (compressedQuals != null) {
base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset]));
//qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
base2 = (byte) '.';
}
bases2.add(base2);
}
return (hasAtLeastOneSQField ? bases2 : null);
}
public static String secondaryBasePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases2 = new StringBuilder();
ArrayList<Byte> sqbases = secondaryBasePileup(reads, offsets);
if (sqbases == null) { return null; }
for (byte base2 : secondaryBasePileup(reads, offsets)) {
bases2.append((char) base2);
}
return bases2.toString();
}
public static ArrayList<Byte> secondaryQualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte qual2;
if (compressedQuals != null) {
qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
qual2 = 0;
}
quals2.add(qual2);
}
return (hasAtLeastOneSQField ? quals2 : null);
}
public static String secondaryQualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals2 = new StringBuilder();
ArrayList<Byte> sqquals = secondaryQualPileup(reads, offsets);
if (sqquals == null) { return null; }
for (byte qual2 : secondaryQualPileup(reads, offsets)) {
quals2.append(qual2);
}
return quals2.toString();
}
public static ArrayList<Byte> subseq(byte[] fullArray) { public static ArrayList<Byte> subseq(byte[] fullArray) {
return subseq(fullArray, 0, fullArray.length); return subseq(fullArray, 0, fullArray.length);
} }

View File

@ -51,6 +51,10 @@ if __name__ == "__main__":
head, lane_filename = os.path.split(lane) head, lane_filename = os.path.split(lane)
filebase = os.path.splitext(lane_filename)[0] filebase = os.path.splitext(lane_filename)[0]
# make the pileup
# 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam
# 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam
# convert the fasta # convert the fasta
for analysis in commandsList: for analysis in commandsList:
output = os.path.join(directory, filebase + '.' + analysis + '.output') output = os.path.join(directory, filebase + '.' + analysis + '.output')