@BasicPileup - made the counts public so they can be used
@PoolUtils - split reads by indel/simple base @BaseTransitionTable - complete refactoring, nicer now @UnifiedArgumentCollection - added PoolSize as an argument @UnifiedGenotyper - checks to ensure pooled sequencing uses the appropriate model @GenotypeCalculationModel - instantiates with the new PoolSize argument git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1867 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bdb34fcf38
commit
ad777a9c14
|
|
@ -26,6 +26,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
||||||
protected boolean GENOTYPE_MODE;
|
protected boolean GENOTYPE_MODE;
|
||||||
protected boolean POOLED_INPUT;
|
protected boolean POOLED_INPUT;
|
||||||
|
protected int POOL_SIZE;
|
||||||
protected double LOD_THRESHOLD;
|
protected double LOD_THRESHOLD;
|
||||||
protected int maxDeletionsInPileup;
|
protected int maxDeletionsInPileup;
|
||||||
protected boolean VERBOSE;
|
protected boolean VERBOSE;
|
||||||
|
|
@ -53,6 +54,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
defaultPlatform = UAC.defaultPlatform;
|
defaultPlatform = UAC.defaultPlatform;
|
||||||
GENOTYPE_MODE = UAC.GENOTYPE;
|
GENOTYPE_MODE = UAC.GENOTYPE;
|
||||||
POOLED_INPUT = UAC.POOLED;
|
POOLED_INPUT = UAC.POOLED;
|
||||||
|
POOL_SIZE = UAC.POOLSIZE;
|
||||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||||
VERBOSE = UAC.VERBOSE;
|
VERBOSE = UAC.VERBOSE;
|
||||||
|
|
|
||||||
|
|
@ -31,7 +31,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
public class UnifiedArgumentCollection {
|
public class UnifiedArgumentCollection {
|
||||||
|
|
||||||
// control the various models to be used
|
// control the various models to be used
|
||||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development", required = false)
|
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with a pooled genotype calculation.", required = false)
|
||||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||||
|
|
||||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||||
|
|
@ -43,6 +43,9 @@ public class UnifiedArgumentCollection {
|
||||||
@Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false)
|
@Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false)
|
||||||
public boolean POOLED = false;
|
public boolean POOLED = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool", required = false)
|
||||||
|
public int POOLSIZE = 0;
|
||||||
|
|
||||||
|
|
||||||
// control the output
|
// control the output
|
||||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
|
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
|
||||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||||
|
|
@ -88,6 +89,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
||||||
**/
|
**/
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
|
if ( UAC.POOLED && UAC.genotypeModel == GenotypeCalculationModel.Model.EM_POINT_ESTIMATE ) {
|
||||||
|
throw new StingException("This was an attempt to use an EM Point Estimate model with pooled genotype calculations. This model does not work with pooled data.");
|
||||||
|
}
|
||||||
|
|
||||||
// get all of the unique sample names
|
// get all of the unique sample names
|
||||||
samples = new HashSet<String>();
|
samples = new HashSet<String>();
|
||||||
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
||||||
|
|
|
||||||
|
|
@ -11,10 +11,9 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
|
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.ListIterator;
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
|
@ -26,27 +25,37 @@ import net.sf.samtools.SAMRecord;
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
@By(DataSource.REFERENCE)
|
@By(DataSource.REFERENCE)
|
||||||
public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<ReferenceContextWindow,BaseTransitionTable>{
|
public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<ReferenceContextWindow,Integer>{
|
||||||
@Argument(fullName="usePreviousBases", doc="Use previous bases as part of the calculation, uses the specified number, defaults to 0", required=false)
|
@Argument(fullName="usePreviousBases", doc="Use previous bases of the reference as part of the calculation, uses the specified number, defaults to 0", required=false)
|
||||||
int nPreviousBases = 0;
|
int nPreviousBases = 0;
|
||||||
@Argument(fullName="useSecondaryBase",doc="Use the secondary base of a read as part of the calculation", required=false)
|
@Argument(fullName="useSecondaryBase",doc="Use the secondary base of a read as part of the calculation", required=false)
|
||||||
boolean useSecondaryBase = false;
|
boolean useSecondaryBase = false;
|
||||||
@Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false)
|
@Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false)
|
||||||
int confidentRefThreshold = 5;
|
int confidentRefThreshold = 5;
|
||||||
@Argument(fullName="pileupMismatchThreshold",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false)
|
@Argument(fullName="maxNumMismatches",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false)
|
||||||
int pileupMismatchThreshold = 1;
|
int maxNumMismatches = 1;
|
||||||
|
@Argument(fullName="minMappingQuality", doc ="Set the alignment quality below which to ignore reads; defaults to 60", required = false)
|
||||||
|
int minMappingQuality = 60;
|
||||||
|
@Argument(fullName="minQualityScore", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to 30", required = false)
|
||||||
|
int minQualityScore = 30;
|
||||||
|
@Argument(fullName="usePileupMismatches", doc = "Use the number of mismatches in the pileup as a condition for the table")
|
||||||
|
boolean usePileupMismatches = false;
|
||||||
|
@Argument(fullName="usePreviousReadBases", doc="Use previous bases of the read as part of the calculation. Will ignore reads if there aren't this many previous bases. Uses the specified number. Defaults to 0")
|
||||||
|
int nPreviousReadBases = 0;
|
||||||
|
|
||||||
private UnifiedGenotyper ug;
|
private UnifiedGenotyper ug;
|
||||||
private ReferenceContextWindow refWindow;
|
private ReferenceContextWindow refWindow;
|
||||||
|
private Set<BaseTransitionTable> conditionalTables;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
ug = new UnifiedGenotyper();
|
ug = new UnifiedGenotyper();
|
||||||
ug.initialize();
|
ug.initialize();
|
||||||
refWindow = new ReferenceContextWindow(nPreviousBases);
|
refWindow = new ReferenceContextWindow(nPreviousBases);
|
||||||
|
conditionalTables = new HashSet<BaseTransitionTable>();
|
||||||
}
|
}
|
||||||
|
|
||||||
public BaseTransitionTable reduceInit() {
|
public Integer reduceInit() {
|
||||||
return new BaseTransitionTable(nPreviousBases,useSecondaryBase);
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ReferenceContextWindow map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
public ReferenceContextWindow map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||||
|
|
@ -56,36 +65,133 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Referen
|
||||||
return refWindow;
|
return refWindow;
|
||||||
}
|
}
|
||||||
|
|
||||||
public BaseTransitionTable reduce( ReferenceContextWindow map, BaseTransitionTable confusionCounts ) {
|
public Integer reduce ( ReferenceContextWindow map, Integer prevReduce ) {
|
||||||
// todo -- refactor
|
|
||||||
if ( map.isValidWindow() ) {
|
if ( map.isValidWindow() ) {
|
||||||
List<SAMRecord> reads = map.getPileup().getReads();
|
List<SAMRecord> reads = map.getPileup().getReads();
|
||||||
List<Integer> offsets = map.getPileup().getOffsets();
|
List<Integer> offsets = map.getPileup().getOffsets();
|
||||||
ReferenceContext ref = map.getMiddleReferenceContext();
|
ReferenceContext ref = map.getMiddleReferenceContext();
|
||||||
String forwardContext = map.getForwardRefString();
|
|
||||||
String reverseContext = map.getReverseRefString();
|
|
||||||
for ( int r = 0; r < reads.size(); r ++ ) {
|
for ( int r = 0; r < reads.size(); r ++ ) {
|
||||||
if ( includeRead(reads.get(r) ,offsets.get(r)) ) {
|
if ( useRead( reads.get(r), offsets.get(r), ref ) ) {
|
||||||
confusionCounts.update(createConfusionContext(reads.get(r),offsets.get(r),ref,forwardContext,reverseContext));
|
updateTables( reads.get(r), offsets.get(r), map );
|
||||||
|
prevReduce++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return confusionCounts;
|
return prevReduce;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone( BaseTransitionTable table ) {
|
public void onTraversalDone( Integer numValidObservedMismatchingReads ) {
|
||||||
out.printf("%s%n", makeHeader() );
|
logger.info(numValidObservedMismatchingReads);
|
||||||
table.print(out);
|
out.print(createHeaderFromConditions());
|
||||||
|
for ( BaseTransitionTable t : conditionalTables )
|
||||||
|
t.print(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void updateTables ( SAMRecord read, int offset, ReferenceContextWindow map ) {
|
||||||
|
List<Comparable> readConditions = buildConditions(read,offset,map);
|
||||||
|
|
||||||
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
boolean createNewTable = true;
|
||||||
Pair<List<GenotypeCall>, GenotypeMetaData> calls = ug.map(tracker,ref,context);
|
|
||||||
if (calls == null)
|
for ( BaseTransitionTable t : conditionalTables ) {
|
||||||
|
if ( t.conditionsMatch(readConditions) ) {
|
||||||
|
t.update(read.getReadBases()[offset], map.getMiddleReferenceContext().getBase());
|
||||||
|
createNewTable = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( createNewTable ) {
|
||||||
|
BaseTransitionTable t = new BaseTransitionTable(readConditions);
|
||||||
|
t.update(read.getReadBases()[offset],map.getMiddleReferenceContext().getBase());
|
||||||
|
conditionalTables.add(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean useRead( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||||
|
|
||||||
|
if ( Character.toUpperCase(read.getReadBases()[offset]) == ref.getBase() ) {
|
||||||
return false;
|
return false;
|
||||||
return ( ! calls.first.get(0).isVariant(ref.getBase()) && calls.first.get(0).getNegLog10PError() >= confidentRefThreshold );
|
} else if ( read.getMappingQuality() < minMappingQuality ) {
|
||||||
|
return false;
|
||||||
|
} else if ( ! BaseUtils.isRegularBase( (char) read.getReadBases()[offset]) ) {
|
||||||
|
return false;
|
||||||
|
} else if ( read.getBaseQualities()[offset] < minQualityScore ) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Comparable> buildConditions( SAMRecord read, int offset, ReferenceContextWindow map ) {
|
||||||
|
ArrayList<Comparable> conditions = new ArrayList<Comparable>();
|
||||||
|
|
||||||
|
if ( nPreviousBases > 0 ) {
|
||||||
|
if ( read.getReadNegativeStrandFlag() )
|
||||||
|
conditions.add(map.getForwardRefString());
|
||||||
|
else
|
||||||
|
conditions.add(map.getReverseRefString());
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( useSecondaryBase ) {
|
||||||
|
conditions.add(getSecondaryBase(read,offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( nPreviousReadBases > 0 ) {
|
||||||
|
conditions.add(read.getReadString().substring(offset-nPreviousReadBases,offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( usePileupMismatches ) {
|
||||||
|
conditions.add(countMismatches(map.getPileup()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return conditions;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String createHeaderFromConditions() {
|
||||||
|
String header = "True_base\tObserved_base";
|
||||||
|
|
||||||
|
if ( nPreviousBases > 0) {
|
||||||
|
header = header+"\tPrevious_"+nPreviousBases+"_bases";
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( useSecondaryBase ) {
|
||||||
|
header = header + "\tSecondary_base";
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( nPreviousReadBases > 0 ) {
|
||||||
|
header = header + "\tPrevious_"+nPreviousReadBases+"_read_bases";
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( usePileupMismatches ) {
|
||||||
|
header = header + "\tNumber_of_pileup_mismatches";
|
||||||
|
}
|
||||||
|
|
||||||
|
return String.format("%s\t%s%n",header,"Counts");
|
||||||
|
}
|
||||||
|
|
||||||
|
public int countMismatches(ReadBackedPileup p) {
|
||||||
|
int refM = 0;
|
||||||
|
|
||||||
|
switch (p.getRef()) {
|
||||||
|
case 'A': refM = BasicPileup.countBases(p.getBases()).a;
|
||||||
|
break;
|
||||||
|
case 'C': refM = BasicPileup.countBases(p.getBases()).c;
|
||||||
|
break;
|
||||||
|
case 'G': refM = BasicPileup.countBases(p.getBases()).g;
|
||||||
|
break;
|
||||||
|
case 'T': refM = BasicPileup.countBases(p.getBases()).t;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new StingException("Exception in countMismatches - this has been called for a non-reference base. Base character from pileup was " + p.getRef() );
|
||||||
|
}
|
||||||
|
|
||||||
|
return p.size()-refM;
|
||||||
|
}
|
||||||
|
|
||||||
|
public char getSecondaryBase ( SAMRecord read, int offset ) {
|
||||||
|
return BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex( ( (byte[]) read.getAttribute("SQ") )[offset] ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean baseIsUsable ( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) {
|
public boolean baseIsUsable ( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) {
|
||||||
|
|
@ -101,102 +207,19 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Referen
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return mismatches > pileupMismatchThreshold;
|
return mismatches >= maxNumMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||||
|
Pair<List<GenotypeCall>, GenotypeMetaData> calls = ug.map(tracker,ref,context);
|
||||||
|
if (calls == null)
|
||||||
|
return false;
|
||||||
|
return ( ! calls.first.get(0).isVariant(ref.getBase()) && calls.first.get(0).getNegLog10PError() >= confidentRefThreshold );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean refIsNotN(ReferenceContext ref) {
|
public boolean refIsNotN(ReferenceContext ref) {
|
||||||
return ref.getBase() != 'N';
|
return BaseUtils.isRegularBase(ref.getBase());
|
||||||
}
|
|
||||||
|
|
||||||
public boolean includeRead ( SAMRecord read, int offset) {
|
|
||||||
// todo -- do we want to filter out individual reads?
|
|
||||||
return ! readWindowContainsNonBaseCharacters(read,offset,read.getNotPrimaryAlignmentFlag());
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean readWindowContainsNonBaseCharacters( SAMRecord read, int offset, boolean isNegative ) {
|
|
||||||
byte[] bases = read.getReadBases();
|
|
||||||
// System.out.println("readWindowContainsNonBaseCharacters");
|
|
||||||
if ( ! isNegative ) {
|
|
||||||
for ( int i = offset; i <= offset + nPreviousBases; i ++ ) {
|
|
||||||
char base = Character.toUpperCase(convertIUPACByteToChar(bases[i]));
|
|
||||||
// System.out.println(base);
|
|
||||||
if ( ! ( base == 'A' || base == 'G' || base == 'C' || base == 'T') ) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
} else {
|
|
||||||
for ( int i = offset; i >= offset - nPreviousBases; i -- ) {
|
|
||||||
char base = Character.toUpperCase(convertIUPACByteToChar(bases[i]));
|
|
||||||
// System.out.println(base);
|
|
||||||
if ( ! ( base == 'A' || base == 'G' || base == 'C' || base == 'T') ) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Pair<String,char[]> createConfusionContext(SAMRecord read, int offset, ReferenceContext ref, String refFor, String refRev) {
|
|
||||||
char[] baseChars;
|
|
||||||
if ( useSecondaryBase ) {
|
|
||||||
baseChars = new char[3];
|
|
||||||
} else {
|
|
||||||
baseChars = new char[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
baseChars[0] = Character.toUpperCase(ref.getBase());
|
|
||||||
baseChars[1] = Character.toUpperCase(convertIUPACByteToChar(read.getReadBases()[offset]));
|
|
||||||
|
|
||||||
if ( useSecondaryBase ) {
|
|
||||||
baseChars[2] = Character.toUpperCase(BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(((byte[]) read.getAttribute("SQ"))[offset])));
|
|
||||||
}
|
|
||||||
|
|
||||||
Pair<String,char[]> confusionContext;
|
|
||||||
|
|
||||||
if ( read.getReadNegativeStrandFlag() ) {
|
|
||||||
confusionContext = new Pair<String,char[]>(refRev, baseChars);
|
|
||||||
} else {
|
|
||||||
confusionContext = new Pair<String,char[]>(refFor, baseChars);
|
|
||||||
}
|
|
||||||
|
|
||||||
return confusionContext;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private char convertIUPACByteToChar(byte iupacBase) {
|
|
||||||
char compBase;
|
|
||||||
|
|
||||||
switch (iupacBase) {
|
|
||||||
case 'A':
|
|
||||||
case 'a': compBase = 'A'; break;
|
|
||||||
case 'C':
|
|
||||||
case 'c': compBase = 'C'; break;
|
|
||||||
case 'G':
|
|
||||||
case 'g': compBase = 'G'; break;
|
|
||||||
case 'T':
|
|
||||||
case 't': compBase = 'T'; break;
|
|
||||||
default: compBase = '.'; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return compBase;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String makeHeader() {
|
|
||||||
|
|
||||||
String output = "Reference_Base\tObserved_Base";
|
|
||||||
|
|
||||||
if ( useSecondaryBase ) {
|
|
||||||
output = output + "\tSecondary_Base";
|
|
||||||
}
|
|
||||||
|
|
||||||
for ( int i = 0; i < nPreviousBases; i ++ ) {
|
|
||||||
output = String.format("%s\t%s", output, "Previous_Base_"+Integer.toString(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
output = String.format("%s\t%s\t%s\t%s", output, "Hash", "N_Observations", "As_Proportion");
|
|
||||||
|
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -204,161 +227,73 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Referen
|
||||||
|
|
||||||
class BaseTransitionTable {
|
class BaseTransitionTable {
|
||||||
|
|
||||||
final int N_BASES = 4;
|
private int[][] table;
|
||||||
final int A_OFFSET = 0;
|
private List<Comparable> conditions;
|
||||||
final int C_OFFSET = 1;
|
|
||||||
final int G_OFFSET = 2;
|
|
||||||
final int T_OFFSET = 3;
|
|
||||||
|
|
||||||
private BaseStringHash strHash;
|
public BaseTransitionTable(List<Comparable> conditions) {
|
||||||
protected int[][] confusionTable;
|
table = new int[BaseUtils.BASES.length][BaseUtils.BASES.length];
|
||||||
protected boolean useSecondaryBase;
|
for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) {
|
||||||
|
for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) {
|
||||||
|
table[i][j]=0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public BaseTransitionTable( int prevBaseStringLength, boolean useSecondaryBase ) {
|
this.conditions = conditions;
|
||||||
this.useSecondaryBase = useSecondaryBase;
|
}
|
||||||
|
|
||||||
if ( useSecondaryBase ) {
|
public boolean conditionsMatch(Object obj) {
|
||||||
strHash = new BaseStringHash(prevBaseStringLength+2);
|
if ( obj == null ) {
|
||||||
|
return false;
|
||||||
|
} else if ( ! (obj instanceof List) ) {
|
||||||
|
return false;
|
||||||
|
} else if ( this.numConditions() != ((List)obj).size() ){
|
||||||
|
return false;
|
||||||
} else {
|
} else {
|
||||||
strHash = new BaseStringHash(prevBaseStringLength+1);
|
boolean eq = true;
|
||||||
|
ListIterator thisIter = this.getConditionIterator();
|
||||||
|
ListIterator thatIter = ((List)obj).listIterator();
|
||||||
|
|
||||||
|
while ( thisIter.hasNext() ) {
|
||||||
|
eq = eq && thisIter.next().equals(thatIter.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
confusionTable = new int[strHash.hashSize()][N_BASES];
|
return eq;
|
||||||
for ( int i = 0; i < strHash.hashSize(); i ++ ) {
|
|
||||||
for ( int j = 0; j < N_BASES; j ++ ) {
|
|
||||||
confusionTable[i][j] = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
public void update( Pair<String, char[]> confusionContext ) {
|
|
||||||
//TODO -- THIS [done]
|
|
||||||
String context = confusionContext.getFirst();
|
|
||||||
char[] bases = confusionContext.getSecond();
|
|
||||||
// secondary base is a part of the hash
|
|
||||||
if ( useSecondaryBase ) {
|
|
||||||
context = context + bases[2] + bases[1];
|
|
||||||
} else {
|
|
||||||
context = context + bases[1];
|
|
||||||
}
|
|
||||||
System.out.println(context);
|
|
||||||
confusionTable[strHash.hash(context)][strHash.hash(bases[0])] ++;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void print( PrintStream out ) {
|
public void print( PrintStream out ) {
|
||||||
long totalCounts = countOverTable();
|
|
||||||
|
|
||||||
for ( int hash = 0; hash < strHash.maxHash(); hash ++ ) {
|
for ( char observedBase : BaseUtils.BASES ) {
|
||||||
for ( int ref = 0; ref < N_BASES; ref ++ ) {
|
for ( char refBase : BaseUtils.BASES ) {
|
||||||
char[] contextBases = strHash.inverse(hash).toCharArray();
|
String outString = observedBase+"\t"+refBase;
|
||||||
String output = Character.toString(strHash.invert(ref));
|
for ( Comparable c : conditions ) {
|
||||||
for ( int b = contextBases.length-1; b > 0; b ++ ) {
|
outString = outString+"\t"+c.toString();
|
||||||
output = output + "\t" + Character.toString(contextBases[b]);
|
|
||||||
}
|
}
|
||||||
|
out.printf("%s\t%d%n",outString,table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]);
|
||||||
output = String.format("%s\t%f\t%d\t%f",output,hash+(double)ref/4,confusionTable[hash][ref], ((double)confusionTable[hash][ref])/totalCounts);
|
|
||||||
out.printf("%s%n",output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public long countOverTable() {
|
public void update(char observedBase, char refBase ) {
|
||||||
long count = 0l;
|
table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)] ++;
|
||||||
for (int hash = 0; hash < strHash.maxHash(); hash++ ) {
|
|
||||||
for ( int ref = 0; ref < N_BASES; ref ++ ) {
|
|
||||||
count += confusionTable[hash][ref];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return count;
|
public void update(byte observed, char ref) {
|
||||||
|
update( (char) observed, ref);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
public int numConditions() {
|
||||||
|
return conditions.size();
|
||||||
class BaseStringHash {
|
|
||||||
|
|
||||||
// character-level mappings and inverses for
|
|
||||||
// recursive hash
|
|
||||||
|
|
||||||
final int A_HASH = 0;
|
|
||||||
final int C_HASH = 1;
|
|
||||||
final int G_HASH = 2;
|
|
||||||
final int T_HASH = 3;
|
|
||||||
final char INV_0 = 'A';
|
|
||||||
final char INV_1 = 'C';
|
|
||||||
final char INV_2 = 'G';
|
|
||||||
final char INV_3 = 'T';
|
|
||||||
|
|
||||||
int stringLength;
|
|
||||||
|
|
||||||
public BaseStringHash( int stringLength ) {
|
|
||||||
this.stringLength = stringLength;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int maxHash() {
|
private Comparable getCondition(int offset) {
|
||||||
return hashSize()-1;
|
return conditions.get(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int hashSize() {
|
private ListIterator getConditionIterator() {
|
||||||
return (int) Math.round(Math.pow(4,stringLength+1));
|
return conditions.listIterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
public int hash( char b ) {
|
|
||||||
switch(b) {
|
|
||||||
case 'A':
|
|
||||||
return A_HASH;
|
|
||||||
case 'C':
|
|
||||||
return C_HASH;
|
|
||||||
case 'G':
|
|
||||||
return G_HASH;
|
|
||||||
case 'T':
|
|
||||||
return T_HASH;
|
|
||||||
default:
|
|
||||||
throw new StingException("Incorrect base type sent to base hashing function, was: "+b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int hash ( String s ) {
|
|
||||||
return recursiveHash(s,0);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int recursiveHash( String s, int offset ) {
|
|
||||||
// System.out.println(s+"\t"+offset);
|
|
||||||
if ( offset == s.length() ) {
|
|
||||||
return 0;
|
|
||||||
} else {
|
|
||||||
return (int) Math.round(hash(s.charAt(offset))*Math.pow(4,s.length()-offset-1)) + recursiveHash(s, offset+1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String inverse( int h ) {
|
|
||||||
return recursiveInverse(h, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
public char invert( int h ) {
|
|
||||||
switch(h) {
|
|
||||||
case 0:
|
|
||||||
return INV_0;
|
|
||||||
case 1:
|
|
||||||
return INV_1;
|
|
||||||
case 2:
|
|
||||||
return INV_2;
|
|
||||||
case 3:
|
|
||||||
return INV_3;
|
|
||||||
default:
|
|
||||||
throw new StingException("Non-primitive base-string hash code sent to invert. Expected [0,1,2,3]; received "+ Integer.toString(h));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String recursiveInverse(int h, int k ) {
|
|
||||||
if ( h == 0 ) {
|
|
||||||
return "";
|
|
||||||
} else {
|
|
||||||
double quaternary = Math.pow(4,stringLength-k);
|
|
||||||
int coef = (int) Math.floor( h/quaternary );
|
|
||||||
return Character.toString(invert(coef)) + recursiveInverse(h - (int) Math.floor(quaternary), k+1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -409,7 +344,7 @@ class ReferenceContextWindow {
|
||||||
|
|
||||||
public String getForwardRefString() {
|
public String getForwardRefString() {
|
||||||
String ref = "";
|
String ref = "";
|
||||||
for ( ReferenceContext c : prevRefs.subList(0,nPrevBases+1) ) {
|
for ( ReferenceContext c : prevRefs.subList(0,nPrevBases) ) {
|
||||||
ref = ref + c.getBase();
|
ref = ref + c.getBase();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -418,7 +353,7 @@ class ReferenceContextWindow {
|
||||||
|
|
||||||
public String getReverseRefString() { // todo -- make sure we want to flip this done (yes we do)
|
public String getReverseRefString() { // todo -- make sure we want to flip this done (yes we do)
|
||||||
String ref = "";
|
String ref = "";
|
||||||
for ( int base = prevRefs.size()-1; base >= nPrevBases; base -- ) {
|
for ( int base = prevRefs.size()-1; base > nPrevBases; base -- ) {
|
||||||
ref = ref + prevRefs.get(base).getBase();
|
ref = ref + prevRefs.get(base).getBase();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import java.util.ArrayList;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -26,6 +27,33 @@ public class PoolUtils {
|
||||||
public static final int BASE_T_OFFSET = 3;
|
public static final int BASE_T_OFFSET = 3;
|
||||||
public static final int BASE_INDEXED_ARRAY_SIZE = 4;
|
public static final int BASE_INDEXED_ARRAY_SIZE = 4;
|
||||||
|
|
||||||
|
public static Pair<List<SAMRecord>, List<Integer>> splitReadsByIndels( List<SAMRecord> reads, List<Integer> offsets, boolean returnBases ) {
|
||||||
|
|
||||||
|
List<SAMRecord> baseReads = new ArrayList<SAMRecord>();
|
||||||
|
List<SAMRecord> indelReads = new ArrayList<SAMRecord>();
|
||||||
|
List<Integer> baseOffsets = new ArrayList<Integer>();
|
||||||
|
List<Integer> indelOffsets = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
|
||||||
|
for ( int r = 0; r < reads.size(); r ++ ) {
|
||||||
|
SAMRecord read = reads.get(r);
|
||||||
|
int offset = offsets.get(r);
|
||||||
|
if (BaseUtils.isRegularBase( (char) read.getReadBases()[offset] ) ) {
|
||||||
|
baseReads.add(read);
|
||||||
|
baseOffsets.add(offset);
|
||||||
|
} else {
|
||||||
|
indelReads.add(read);
|
||||||
|
indelOffsets.add(offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (returnBases) {
|
||||||
|
return new Pair<List<SAMRecord>,List<Integer>>(baseReads,baseOffsets);
|
||||||
|
} else {
|
||||||
|
return new Pair<List<SAMRecord>,List<Integer>>(indelReads,indelOffsets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static ReadOffsetQuad splitReadsByReadDirection(List<SAMRecord> reads, List<Integer> offsets) {
|
public static ReadOffsetQuad splitReadsByReadDirection(List<SAMRecord> reads, List<Integer> offsets) {
|
||||||
ArrayList<SAMRecord> forwardReads;
|
ArrayList<SAMRecord> forwardReads;
|
||||||
ArrayList<SAMRecord> reverseReads;
|
ArrayList<SAMRecord> reverseReads;
|
||||||
|
|
|
||||||
|
|
@ -386,7 +386,7 @@ abstract public class BasicPileup implements Pileup {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class BaseCounts {
|
public static class BaseCounts {
|
||||||
int a, c, t, g;
|
public int a, c, t, g;
|
||||||
|
|
||||||
public BaseCounts(int a, int c, int t, int g) {
|
public BaseCounts(int a, int c, int t, int g) {
|
||||||
this.a = a;
|
this.a = a;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue