Initial graph-based reference implementation and alignment assessor. Not suitable for public use
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1921 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
31d143a841
commit
68fa6da788
|
|
@ -0,0 +1,252 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
import org.jgrapht.graph.SimpleGraph;
|
||||
import org.jgrapht.graph.SimpleDirectedGraph;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class GraphReferenceAssessor extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName="graphFile", shortName="GF", doc="", required=true)
|
||||
String graphFile = null;
|
||||
ObjectInputStream graphSerialStream = null;
|
||||
|
||||
@Argument(fullName="MAX", shortName="MAX", doc="", required=false)
|
||||
int MAXREADS = -1;
|
||||
|
||||
@Argument(fullName="ignore0MM", shortName="I0", doc="", required=false)
|
||||
boolean IGNORE_0_MM = false;
|
||||
|
||||
@Argument(fullName="DEBUG", shortName="DB", doc="", required=false)
|
||||
int DEBUG_LEVEL = 0;
|
||||
|
||||
static boolean DEBUG = false;
|
||||
static boolean DEBUG2 = false; // crazy level
|
||||
|
||||
@Argument(fullName="read", doc="", required=false)
|
||||
String onlyDoRead = null;
|
||||
|
||||
ReferenceGraph graphRef = null;
|
||||
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
DEBUG = DEBUG_LEVEL > 0;
|
||||
DEBUG2 = DEBUG_LEVEL > 1; // crazy level
|
||||
|
||||
try {
|
||||
logger.info("Reading graph reference " + graphFile );
|
||||
graphSerialStream = new ObjectInputStream( new FileInputStream( graphFile ) );
|
||||
graphRef = (ReferenceGraph)graphSerialStream.readObject();
|
||||
graphRef.setDebugPrinting(DEBUG);
|
||||
graphRef.validateGraph();
|
||||
logger.info(graphRef.toBriefString());
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Couldn't open file " + graphFile, e);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Couldn't write to file " + graphFile, e);
|
||||
} catch ( ClassNotFoundException e ) {
|
||||
throw new StingException("Couldn't read ReferenceGraph from file " + graphFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
private static MismatchCounter countMismatches(byte[] ref, int refOffset, byte[] bases, byte[] quals, int basesOffset, int length) {
|
||||
MismatchCounter mm = new MismatchCounter();
|
||||
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
byte rawRefBase = ref[i + refOffset];
|
||||
byte rawReadBase = bases[i + basesOffset];
|
||||
int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawRefBase);
|
||||
int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase);
|
||||
|
||||
boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase;
|
||||
if ( mmP ) {
|
||||
mm.nMM++;
|
||||
mm.qSum += quals != null ? quals[i + basesOffset] : 0;
|
||||
}
|
||||
|
||||
if ( GraphReferenceAssessor.DEBUG2 )
|
||||
System.out.printf("%s%d %c %c %s %b%n", Utils.dupString(' ', basesOffset + 2), basesOffset, (char)rawRefBase, (char)rawReadBase, mm, mmP);
|
||||
}
|
||||
|
||||
return mm;
|
||||
}
|
||||
|
||||
private static MismatchCounter countMismatches(byte[] ref, byte[] bases, byte[] quals) {
|
||||
return countMismatches(ref, 0, bases, quals, 0, bases.length);
|
||||
}
|
||||
|
||||
private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Collection<Fragment> frags, int fragOffset, byte[] bases, byte[] quals, int readOffset ) {
|
||||
if ( frags.size() == 0 )
|
||||
throw new RuntimeException("Fragment list is empty!");
|
||||
|
||||
MismatchCounter minNMM = MismatchCounter.MAX_VALUE;
|
||||
|
||||
for ( Fragment next : frags ) {
|
||||
MismatchCounter recNMM = countMismatchesOnGraph( graph, next, 0, bases, quals, readOffset );
|
||||
minNMM = minNMM.min( recNMM );
|
||||
}
|
||||
|
||||
return minNMM;
|
||||
}
|
||||
|
||||
private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Fragment frag, int fragOffset, byte[] bases, byte[] quals, int readOffset ) {
|
||||
if ( GraphReferenceAssessor.DEBUG )System.out.printf("%sfrag %s -> %d%n", Utils.dupString(' ', readOffset + 2), frag, readOffset);
|
||||
|
||||
MismatchCounter mm = new MismatchCounter();
|
||||
|
||||
if ( readOffset < bases.length ) {
|
||||
int nRemainingBases = bases.length - readOffset;
|
||||
int cmpLength = frag.getBaseLengthFrom(fragOffset, nRemainingBases); // how many bases over in the fragment are we from the offset
|
||||
MismatchCounter fragMM = countMismatches(frag.getUnderlyingBases(), frag.getUnderlyingOffset() + fragOffset, bases, quals, readOffset, cmpLength);
|
||||
mm.add(fragMM);
|
||||
|
||||
// // still have some counting to do
|
||||
// for ( int i = 0; i < baseLength; i++ ) {
|
||||
// int fragBaseOffset = fragOffset + i;
|
||||
// int readBaseOffset = readOffset + i;
|
||||
//
|
||||
// byte rawFragBase = frag.getBase(fragBaseOffset);
|
||||
// byte rawReadBase = bases[readBaseOffset];
|
||||
// int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawFragBase);
|
||||
// int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase);
|
||||
//
|
||||
// boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase;
|
||||
// if ( mmP ) nMM++;
|
||||
|
||||
if ( nRemainingBases > cmpLength ) {
|
||||
MismatchCounter recMM = countMismatchesOnGraph( graph, graph.outgoingFragments(frag), 0, bases, quals, readOffset + cmpLength );
|
||||
mm.add(recMM);
|
||||
}
|
||||
}
|
||||
|
||||
if ( GraphReferenceAssessor.DEBUG ) System.out.printf("%s=> %s%n", Utils.dupString(' ', readOffset + 2), mm);
|
||||
return mm;
|
||||
}
|
||||
|
||||
private static MismatchCounter countMismatchesOnGraph(ReferenceGraph graph, SAMRecord read) {
|
||||
if ( GraphReferenceAssessor.DEBUG ) System.out.printf("countMismatchesOnGraph( read=%s%n", read.getReadName());
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
|
||||
MismatchCounter minNMM = MismatchCounter.MAX_VALUE;
|
||||
|
||||
for ( Fragment frag : graph.getStartingFragment(loc) ) {
|
||||
int fragOffset = frag.getFragOffsetFrom(loc); // how many bases over in the fragment are we from the offset
|
||||
|
||||
if ( GraphReferenceAssessor.DEBUG )
|
||||
System.out.printf(" countMismatchesOnGraph frag=%s loc=%s bases=%s offset=%d%n", frag, loc, read.getReadString(), fragOffset);
|
||||
|
||||
MismatchCounter recNMM = countMismatchesOnGraph(graph, frag, fragOffset, read.getReadBases(), read.getBaseQualities(), 0);
|
||||
minNMM = minNMM.min( recNMM );
|
||||
}
|
||||
|
||||
return minNMM;
|
||||
}
|
||||
|
||||
public Integer map(char[] refArg, SAMRecord read) {
|
||||
|
||||
if ( MAXREADS-- == 0 ) {
|
||||
System.exit(0);
|
||||
} else if ( onlyDoRead != null && ! read.getReadName().equals(onlyDoRead) ) {
|
||||
;
|
||||
} else if ( ! read.getReadUnmappedFlag() && read.getCigar().numCigarElements() == 1 ) {
|
||||
try {
|
||||
byte[] ref = BaseUtils.charSeq2byteSeq(refArg);
|
||||
// we're all XM
|
||||
int nMMFromRead = (Short)read.getAttribute("NM");
|
||||
MismatchCounter nAlignedMM = countMismatches(ref, read.getReadBases(), read.getBaseQualities());
|
||||
if ( ! IGNORE_0_MM || nAlignedMM.nMM > 0 ) {
|
||||
MismatchCounter nGraphMM = countMismatchesOnGraph(graphRef, read);
|
||||
MismatchCounter deltaMM = nAlignedMM.minus(nGraphMM);
|
||||
|
||||
out.printf("%50s with %5s at %10s: mismatches: %3d (delta %3d) -- %3d %3d -- %3d %3d -- delta %3d %3d%n",
|
||||
read.getReadName(), read.getCigarString(), GenomeLocParser.createGenomeLoc(read),
|
||||
nMMFromRead, nMMFromRead - nAlignedMM.nMM,
|
||||
nAlignedMM.nMM, nAlignedMM.qSum,
|
||||
nGraphMM.nMM, nGraphMM.qSum,
|
||||
deltaMM.nMM, deltaMM.qSum);
|
||||
|
||||
if ( deltaMM.nMM < 0 || deltaMM.qSum < 0 )
|
||||
throw new StingException(read.getReadName() + " is miscalculated");
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
System.out.printf("Exception at %s at %s%n", read.getReadName(), GenomeLocParser.createGenomeLoc(read));
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
; // don't do anything
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* reduceInit is called once before any calls to the map function. We use it here to setup the output
|
||||
* bam file, if it was specified on the command line
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer readScore, Integer data) {
|
||||
return data + readScore;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer data) {
|
||||
out.printf(data.toString());
|
||||
}
|
||||
}
|
||||
|
||||
class MismatchCounter {
|
||||
int nMM = 0;
|
||||
int qSum = 0;
|
||||
|
||||
public static MismatchCounter MAX_VALUE = new MismatchCounter(Integer.MAX_VALUE, Integer.MAX_VALUE );
|
||||
|
||||
public MismatchCounter() {}
|
||||
|
||||
public MismatchCounter(int nMM, int qSum) {
|
||||
this.nMM = nMM;
|
||||
this.qSum = qSum;
|
||||
}
|
||||
|
||||
public void add(MismatchCounter that) {
|
||||
this.nMM += that.nMM;
|
||||
this.qSum += that.qSum;
|
||||
}
|
||||
|
||||
public MismatchCounter min(MismatchCounter that) {
|
||||
int cmpQSum = Integer.valueOf(this.qSum).compareTo(that.qSum);
|
||||
if ( cmpQSum < 0 ) { return this; }
|
||||
else if ( cmpQSum > 0 ) { return that; }
|
||||
else if ( this.nMM < that.nMM ) { return this; }
|
||||
else if ( this.nMM > that.nMM ) { return that; }
|
||||
else { return this; }
|
||||
}
|
||||
|
||||
public MismatchCounter minus(MismatchCounter that) {
|
||||
return new MismatchCounter(this.nMM - that.nMM, this.qSum - that.qSum);
|
||||
}
|
||||
|
||||
public String toString() { return String.format("[MM %d %d]", nMM, qSum); }
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
import org.jgrapht.graph.SimpleGraph;
|
||||
import org.jgrapht.graph.SimpleDirectedGraph;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
@WalkerName("GraphReferenceBuilder")
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
public class GraphReferenceBuilder extends RefWalker<Integer, Integer> {
|
||||
@Argument(fullName="graphFile", shortName="GF", doc="", required=true)
|
||||
String graphFile = null;
|
||||
|
||||
@Argument(fullName="DEBUG", shortName="DB", doc="", required=false)
|
||||
boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName="VALIDATE", shortName="VD", doc="", required=false)
|
||||
boolean VALIDATE_GRAPH = false;
|
||||
|
||||
@Argument(fullName="printFrequency", shortName="F", doc="", required=false)
|
||||
int printFrequency = 10000;
|
||||
|
||||
ObjectOutputStream graphSerialStream = null;
|
||||
|
||||
ReferenceGraph graphRef = null;
|
||||
ReferenceSequenceFile flatReferenceFile = null;
|
||||
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
graphRef = new ReferenceGraph(DEBUG);
|
||||
|
||||
try {
|
||||
graphSerialStream = new ObjectOutputStream( new FileOutputStream( graphFile ) );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Couldn't open file " + graphFile, e);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Couldn't write to file " + graphFile, e);
|
||||
}
|
||||
|
||||
flatReferenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.getToolkit().getArguments().referenceFile);
|
||||
|
||||
ReferenceSequence refSeq = flatReferenceFile.nextSequence();
|
||||
do {
|
||||
//logger.info("Read " + refSeq);
|
||||
graphRef.bindRefenceSequence(refSeq);
|
||||
logger.info(String.format("contig %s has length %d", refSeq.getName(), refSeq.length()));
|
||||
refSeq = flatReferenceFile.nextSequence();
|
||||
} while ( refSeq != null );
|
||||
|
||||
System.out.println(graphRef.toBriefString());
|
||||
}
|
||||
|
||||
int counter = printFrequency;
|
||||
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {
|
||||
// if ( context.getLocation().getStart() == 59384 ) {
|
||||
// try {
|
||||
// Thread.currentThread().sleep(5000);
|
||||
// } catch (InterruptedException e) {
|
||||
// ;
|
||||
// }
|
||||
// }
|
||||
|
||||
boolean alreadyAddedAtThisLoc = false;
|
||||
for ( ReferenceOrderedDatum rod : rodData.getAllRods() ) {
|
||||
if ( rod instanceof Variation && ! alreadyAddedAtThisLoc ) {
|
||||
// if we have multiple variants at a locus, just take the first damn one we see for now
|
||||
Variation variant = (Variation) rod;
|
||||
// todo -- getAlternativeBases should be getAlleles()
|
||||
GenomeLoc loc = variant.getLocation();
|
||||
String[] allAllelesList = null; // variant.getAlternateBases().split(""); // todo fixme
|
||||
if ( allAllelesList.length >= 3 ) { // bad dbSNP format :-(
|
||||
List<String> alleles = Arrays.asList(allAllelesList).subList(1,3);
|
||||
//logger.info(String.format("Adding %s %s", loc, alleles));
|
||||
graphRef.addVariation(variant, loc, alleles);
|
||||
//logger.info(String.format(" Added %s %s", loc, alleles));
|
||||
alreadyAddedAtThisLoc = true;
|
||||
if ( counter-- == 0 ) {
|
||||
logger.info(String.format("Added %s %s %s", loc, alleles, graphRef.toBriefString()));
|
||||
counter = printFrequency;
|
||||
if ( VALIDATE_GRAPH )
|
||||
graphRef.validateGraph();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// todo -- graph should be the reduce result
|
||||
public Integer reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
super.onTraversalDone(sum);
|
||||
try {
|
||||
graphSerialStream.writeObject(graphRef);
|
||||
graphSerialStream.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Couldn't write to file " + graphFile, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,420 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
|
||||
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
import org.jgrapht.graph.SimpleDirectedGraph;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.Serializable;
|
||||
import java.io.IOException;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.io.ObjectInputStream;
|
||||
|
||||
import edu.mit.broad.picard.util.IntervalTree;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Oct 22, 2009
|
||||
* Time: 2:15:54 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class ReferenceGraph extends SimpleDirectedGraph<Fragment, DefaultEdge> implements Serializable {
|
||||
final private static boolean USE_IT = true;
|
||||
final private static boolean THROW_ERRORS_ON_BAD_INPUTS = false;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
int nSkippedIndels = 0;
|
||||
int nSkippedBecauseOfContinguousVariation = 0;
|
||||
int nBadPolymorphisms = 0;
|
||||
int nMultiStateAlleles = 0;
|
||||
|
||||
GenomeLoc initialLoc = null;
|
||||
|
||||
private transient IntervalTree<Fragment> loc2Fragment = new IntervalTree<Fragment>();
|
||||
|
||||
public ReferenceGraph(boolean printDebuggingInfo) {
|
||||
super(DefaultEdge.class);
|
||||
DEBUG = printDebuggingInfo;
|
||||
}
|
||||
|
||||
public ReferenceGraph() {
|
||||
this(false);
|
||||
}
|
||||
|
||||
public void setDebugPrinting(boolean enable) {
|
||||
this.DEBUG = enable;
|
||||
}
|
||||
|
||||
public void bindRefenceSequence(ReferenceSequence seq) {
|
||||
GenomeLoc refSeqLoc = GenomeLocParser.createGenomeLoc(seq.getContigIndex(), 1, seq.length());
|
||||
String refString = StringUtil.bytesToString(seq.getBases()).toUpperCase();
|
||||
Fragment frag = new Fragment(refSeqLoc, 1, StringUtil.stringToBytes(refString));
|
||||
addFragment(frag);
|
||||
initialLoc = refSeqLoc;
|
||||
}
|
||||
|
||||
private void addFragmentToIntervalTree(Fragment frag) {
|
||||
loc2Fragment.put((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop(), frag);
|
||||
}
|
||||
|
||||
private void addFragment(Fragment frag) {
|
||||
addFragmentToIntervalTree(frag);
|
||||
addVertex(frag);
|
||||
}
|
||||
|
||||
private void removeFragment(Fragment frag) {
|
||||
loc2Fragment.remove((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop());
|
||||
removeVertex(frag);
|
||||
}
|
||||
|
||||
public void validateGraph() {
|
||||
for ( Fragment v : this.vertexSet() ) {
|
||||
if ( this.inDegreeOf(v) == 0 && v.getLocation().getStart() != initialLoc.getStart() ) {
|
||||
throw new StingException(String.format("Fragment %s has no incoming edges but isn't at the start of the contig %s", v, initialLoc));
|
||||
}
|
||||
if ( this.outDegreeOf(v) == 0 && v.getLocation().getStop() != initialLoc.getStop() ) {
|
||||
throw new StingException(String.format("Fragment %s has no outgoing edges but isn't at the end of the contig %s", v, initialLoc));
|
||||
}
|
||||
}
|
||||
|
||||
//System.out.printf("Passed validation: %s%n", this.toBriefString());
|
||||
}
|
||||
|
||||
private void rebuildIntervalTree() {
|
||||
if ( DEBUG ) System.out.printf("rebuilding IntervalTree()%n");
|
||||
for ( Fragment v : this.vertexSet() ) {
|
||||
if ( DEBUG ) System.out.printf(" adding interval tree: %s%n", v);
|
||||
addFragmentToIntervalTree(v);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean allelesAreInExcisedFragment(Fragment cut, List<String> alleles) {
|
||||
boolean foundRef = false;
|
||||
for ( String allele : alleles ) {
|
||||
if ( allele.equals(cut.getBases()) )
|
||||
foundRef = true;
|
||||
}
|
||||
|
||||
if ( ! foundRef && THROW_ERRORS_ON_BAD_INPUTS )
|
||||
throw new StingException(String.format("Polymorphic alleles %s do not contain the reference sequence %s", alleles, cut.getBases()));
|
||||
|
||||
return foundRef;
|
||||
}
|
||||
|
||||
public void addVariation(Variation variant, GenomeLoc loc, List<String> alleles) {
|
||||
if ( DEBUG ) System.out.printf("addVariation(%s, %s)%n", loc, alleles);
|
||||
//validateGraph();
|
||||
|
||||
if ( variant.isSNP() ) {
|
||||
Fragment frag = getContainingFragment(loc);
|
||||
|
||||
if ( frag == null ) {
|
||||
nMultiStateAlleles++;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( ! allelesAreInExcisedFragment(subFragment(frag, loc, 1), alleles)) {
|
||||
nBadPolymorphisms++;
|
||||
return;
|
||||
}
|
||||
|
||||
List<Fragment> split = exciseFragment(frag, loc);
|
||||
if ( split != null ) {
|
||||
Fragment left = split.get(0);
|
||||
Fragment cut = split.get(1);
|
||||
Fragment right = split.get(2);
|
||||
|
||||
if ( DEBUG ) System.out.printf(" cutFrag(%s, %s)%n", loc, cut);
|
||||
|
||||
for ( String allele : alleles ) {
|
||||
byte[] bases = StringUtil.stringToBytes(allele);
|
||||
double freq = 1.0 / alleles.size();
|
||||
Fragment alleleFrag = new Fragment(loc, freq, 0, bases.length, bases);
|
||||
if ( DEBUG ) System.out.printf(" Creating allele fragment %s%n", alleleFrag);
|
||||
addFragment(alleleFrag);
|
||||
if ( left != null ) addEdge(left, alleleFrag);
|
||||
if ( right != null ) addEdge(alleleFrag, right);
|
||||
}
|
||||
} else {
|
||||
nSkippedBecauseOfContinguousVariation++;
|
||||
}
|
||||
} else {
|
||||
nSkippedIndels++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Fragment subFragment(Fragment frag, GenomeLoc loc, double freq ) {
|
||||
return new Fragment(loc, 1, frag.getUnderlyingBases());
|
||||
}
|
||||
|
||||
public List<Fragment> exciseFragment(Fragment frag, GenomeLoc loc) {
|
||||
if ( DEBUG ) System.out.printf(" exciseFragment(%s, %s)%n", frag, loc);
|
||||
GenomeLoc fragLoc = frag.getLocation();
|
||||
|
||||
Fragment cut = subFragment(frag, loc, 1);
|
||||
|
||||
Set<DefaultEdge> inToFrag = incomingEdgesOf(frag);
|
||||
Set<DefaultEdge> outOfFrag = outgoingEdgesOf(frag);
|
||||
|
||||
Fragment left = null;
|
||||
if ( fragLoc.getStart() == loc.getStart() ) {
|
||||
if ( ! inToFrag.isEmpty() ) {
|
||||
if ( THROW_ERRORS_ON_BAD_INPUTS )
|
||||
throw new StingException(String.format("Attempting to create a variation at the start of a fragment %s %s", frag, loc));
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
GenomeLoc leftLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), fragLoc.getStart(), loc.getStart()-1);
|
||||
left = new Fragment(leftLoc, 1, frag.getUnderlyingBases());
|
||||
addFragment(left);
|
||||
|
||||
for ( DefaultEdge e : inToFrag ) {
|
||||
addEdge(getEdgeSource(e), left);
|
||||
}
|
||||
|
||||
removeAllEdges(inToFrag);
|
||||
}
|
||||
|
||||
Fragment right = null;
|
||||
if ( fragLoc.getStop() == loc.getStop() ) {
|
||||
if ( ! outOfFrag.isEmpty() ) {
|
||||
throw new StingException(String.format("Attempting to create a variation at the end of a fragment %s %s", frag, loc));
|
||||
}
|
||||
} else {
|
||||
GenomeLoc rightLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), loc.getStop()+1, fragLoc.getStop());
|
||||
right = new Fragment(rightLoc, 1, frag.getUnderlyingBases());
|
||||
addFragment(right);
|
||||
|
||||
for ( DefaultEdge e : outOfFrag ) {
|
||||
addEdge(right, getEdgeTarget(e));
|
||||
}
|
||||
|
||||
removeAllEdges(outOfFrag);
|
||||
}
|
||||
|
||||
if ( DEBUG ) System.out.printf(" removing %s%n", frag);
|
||||
removeFragment(frag);
|
||||
if ( DEBUG ) System.out.printf(" returning left=%s right=%s%n", left, right);
|
||||
return Arrays.asList(left, cut, right);
|
||||
}
|
||||
|
||||
public Fragment getContainingFragment(GenomeLoc loc) {
|
||||
Fragment frag = USE_IT ? getContainingFragmentIT(loc) : getContainingFragmentG(loc);
|
||||
|
||||
|
||||
if ( frag == null ) {
|
||||
if ( THROW_ERRORS_ON_BAD_INPUTS )
|
||||
throw new StingException("No spanning fragment was found for " + loc);
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else if ( frag.getLocation().getStart() > loc.getStart() || frag.getLocation().getStop() < loc.getStop() )
|
||||
throw new StingException("BUG: bad spanning fragment found for " + loc + " was " + frag.getLocation() );
|
||||
else
|
||||
return frag;
|
||||
}
|
||||
|
||||
public Fragment getContainingFragmentG(GenomeLoc loc) {
|
||||
for ( Fragment v : this.vertexSet() ) {
|
||||
if ( v.getLocation().containsP(loc) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public Fragment getContainingFragmentIT(GenomeLoc loc) {
|
||||
IntervalTree.Node<Fragment> node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStop());
|
||||
if ( node == null )
|
||||
return null;
|
||||
else
|
||||
return node.getValue();
|
||||
}
|
||||
|
||||
public Collection<Fragment> getStartingFragment(GenomeLoc loc) {
|
||||
Collection<Fragment> frags = USE_IT ? getStartingFragmentIT(loc) : getStartingFragmentG(loc);
|
||||
//Collection<Fragment> frags = getStartingFragmentTest(loc);
|
||||
|
||||
if ( frags == null || frags.size() == 0 )
|
||||
throw new StingException("No fragment contains location start of " + loc);
|
||||
if ( frags.size() == 1 && MathUtils.compareDoubles(frags.iterator().next().getFrequency(), 1.0) != 0 ) {
|
||||
Fragment bad = frags.iterator().next();
|
||||
throw new StingException(String.format("Only one fragment was found but it's frequency < 1 %s with %e", bad, bad.getFrequency()));
|
||||
}
|
||||
else
|
||||
return frags;
|
||||
}
|
||||
|
||||
public Collection<Fragment> getStartingFragmentTest(GenomeLoc loc) {
|
||||
Collection<Fragment> fragsFromIT = getStartingFragmentIT(loc);
|
||||
Collection<Fragment> fragsFromG = getStartingFragmentG(loc);
|
||||
|
||||
if ( fragsFromIT.size() != fragsFromG.size() ) {
|
||||
throw new StingException(String.format("Fragment sizes differ %d from IntervalTree, %d from graph", fragsFromIT.size(), fragsFromG.size()));
|
||||
}
|
||||
|
||||
return USE_IT && false ? fragsFromIT : fragsFromG;
|
||||
}
|
||||
|
||||
public Collection<Fragment> getStartingFragmentIT(GenomeLoc loc) {
|
||||
Collection<Fragment> frags = new HashSet<Fragment>();
|
||||
|
||||
Iterator<IntervalTree.Node<Fragment>> it = loc2Fragment.overlappers((int)loc.getStart(), (int)loc.getStart());
|
||||
IntervalTree.Node<Fragment> node = null;
|
||||
while ( it.hasNext() ) {
|
||||
node = it.next();
|
||||
frags.add(node.getValue());
|
||||
}
|
||||
|
||||
// todo -- painful bug work around -- should be removed
|
||||
if ( frags.size() == 1 && MathUtils.compareDoubles(node.getValue().getFrequency(), 1.0) != 0 ) {
|
||||
System.out.printf(">>> Using IT workaround at %s <<<%n", loc);
|
||||
return getStartingFragmentG(loc);
|
||||
}
|
||||
|
||||
return frags;
|
||||
// IntervalTree.Node<Fragment> node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStart());
|
||||
// if ( node == null )
|
||||
// return null;
|
||||
// else
|
||||
// return node.getValue();
|
||||
}
|
||||
|
||||
public Collection<Fragment> getStartingFragmentG(GenomeLoc loc) {
|
||||
Collection<Fragment> frags = new HashSet<Fragment>();
|
||||
for ( Fragment v : this.vertexSet() ) {
|
||||
//if ( v.getLocation().getStart() < loc.getStop() )
|
||||
// System.out.printf("Checking %s vs. %s%n", loc, v.getLocation());
|
||||
if ( v.getLocation().containsStartPosition(loc.getStart()) ) {
|
||||
// System.out.printf(" Adding %s%n", v.getLocation());
|
||||
frags.add(v);
|
||||
}
|
||||
}
|
||||
|
||||
return frags;
|
||||
}
|
||||
|
||||
public Set<Fragment> outgoingFragments( Fragment frag ) {
|
||||
Set<Fragment> outgoingFrags = new HashSet<Fragment>();
|
||||
|
||||
for ( DefaultEdge e : outgoingEdgesOf(frag) ) {
|
||||
outgoingFrags.add(getEdgeTarget(e));
|
||||
}
|
||||
|
||||
if ( outgoingFrags.size() == 0 && frag.getLocation().getStop() != initialLoc.getStop() ) {
|
||||
|
||||
}
|
||||
|
||||
return outgoingFrags;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder s = new StringBuilder();
|
||||
|
||||
for ( Fragment v : this.vertexSet() ) {
|
||||
s.append(String.format("Fragment: %s%n", v.toString()));
|
||||
for ( DefaultEdge e : this.incomingEdgesOf(v) ) {
|
||||
s.append(String.format(" [IN FROM] %s%n", this.getEdgeSource(e)));
|
||||
}
|
||||
for ( DefaultEdge e : this.outgoingEdgesOf(v) ) {
|
||||
s.append(String.format(" [OUT TO ] %s%n", this.getEdgeTarget(e)));
|
||||
}
|
||||
}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
public String toBriefString() {
|
||||
return String.format("GraphRef: %d fragments, %d edges, skipped %d contingous variants, %d indels, %d polymorphisms w/o ref allele, %d multi-state",
|
||||
this.vertexSet().size(), this.edgeSet().size(), nSkippedBecauseOfContinguousVariation, nSkippedIndels, nBadPolymorphisms, nMultiStateAlleles);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// serialization
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
|
||||
//always perform the default de-serialization first
|
||||
stream.defaultReadObject();
|
||||
loc2Fragment = new IntervalTree<Fragment>();
|
||||
rebuildIntervalTree();
|
||||
}
|
||||
}
|
||||
|
||||
class Fragment implements Serializable {
|
||||
GenomeLoc loc = null; // Index position of this fragment into the reference
|
||||
int offset, stop;
|
||||
double freq = -1;
|
||||
byte[] bases = null;
|
||||
|
||||
public Fragment( GenomeLoc loc, double freq, int offset, int stop, byte[] bases ) {
|
||||
this.loc = loc;
|
||||
this.freq = freq;
|
||||
this.bases = bases;
|
||||
this.offset = offset;
|
||||
this.stop = stop;
|
||||
}
|
||||
|
||||
public Fragment( GenomeLoc loc, double freq, byte[] bases ) {
|
||||
this(loc, freq, (int)loc.getStart()-1, (int)loc.getStop(), bases);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
//return String.format("%s:%.2f:%s", loc.toString(), getFrequency(), getBases());
|
||||
return String.format("%s:%.2f", loc.toString(), getFrequency());
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public double getFrequency() {
|
||||
return freq;
|
||||
}
|
||||
|
||||
public int getUnderlyingOffset() { return offset; }
|
||||
public int getStop() { return stop; }
|
||||
public int getLength() { return getStop() - getUnderlyingOffset(); }
|
||||
|
||||
public byte[] getUnderlyingBases() {
|
||||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* how many bases over in the fragment are we over in this fragment?
|
||||
*
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
public int getFragOffsetFrom(GenomeLoc loc) {
|
||||
// todo -- ignores contigs -- can we fix this?
|
||||
if ( getLocation().getStart() > loc.getStart() )
|
||||
throw new StingException("BUG: Request for offset from " + loc + " in frag at " + getLocation() + " but this is beyond the location of the fragment");
|
||||
return (int)(loc.getStart() - getLocation().getStart());
|
||||
}
|
||||
|
||||
public int getBaseLengthFrom( int fragOffset, int maxLength ) {
|
||||
int fragRemaining = getLength() - fragOffset;
|
||||
|
||||
if ( fragRemaining < 0 )
|
||||
throw new StingException("BUG: Request for length from offset " + fragOffset + " but this is longer than the fragment itself");
|
||||
|
||||
return Math.min(fragRemaining, maxLength);
|
||||
}
|
||||
|
||||
public byte getBase(int fragOffset) {
|
||||
return bases[getUnderlyingOffset() + fragOffset];
|
||||
}
|
||||
|
||||
public String getBases() {
|
||||
return StringUtil.bytesToString(getUnderlyingBases(), getUnderlyingOffset(), getLength());
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue