Remove support for BED file interval parsing in the GATK; it should all go through Tribble now. IndelRealigner no longer supports unordered interval input (which shouldn't have been used anyways). Temporarily commenting out serialization of arguments so that tests pass; this whole piece will be deleted soon anyways.
This commit is contained in:
parent
f7df8bdecc
commit
4a7e6fee3f
|
|
@ -25,14 +25,11 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
|
|
|||
|
|
@ -89,18 +89,18 @@ public class GATKArgumentCollection {
|
|||
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
|
||||
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
|
||||
*/
|
||||
@ElementList(required = false)
|
||||
//@ElementList(required = false)
|
||||
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> intervals = null;
|
||||
public List<IntervalBinding<Feature>> intervals = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
|
||||
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
|
||||
*/
|
||||
@ElementList(required = false)
|
||||
//@ElementList(required = false)
|
||||
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> excludeIntervals = null;
|
||||
public List<IntervalBinding<Feature>> excludeIntervals = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
|
||||
|
|
|
|||
|
|
@ -229,14 +229,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam")
|
||||
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
|
||||
|
||||
/**
|
||||
* For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.
|
||||
* Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted")
|
||||
protected boolean TARGET_NOT_SORTED = false;
|
||||
|
||||
/**
|
||||
* Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that
|
||||
* the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'),
|
||||
|
|
@ -366,30 +358,24 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
catch(FileNotFoundException ex) {
|
||||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex);
|
||||
}
|
||||
|
||||
if ( !TARGET_NOT_SORTED ) {
|
||||
|
||||
NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : intervalsFile.split(";")) {
|
||||
// if it's a file, add items to raw interval list
|
||||
if (IntervalUtils.isIntervalFile(fileOrInterval)) {
|
||||
merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) );
|
||||
} else {
|
||||
rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval));
|
||||
}
|
||||
NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : intervalsFile.split(";")) {
|
||||
// if it's a file, add items to raw interval list
|
||||
if (IntervalUtils.isIntervalFile(fileOrInterval)) {
|
||||
merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) );
|
||||
} else {
|
||||
rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval));
|
||||
}
|
||||
if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator());
|
||||
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
|
||||
intervals = merger;
|
||||
} else {
|
||||
// read in the whole list of intervals for cleaning
|
||||
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),
|
||||
IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile)),
|
||||
IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
intervals = locs.iterator();
|
||||
}
|
||||
if ( ! rawIntervals.isEmpty() )
|
||||
merger.add(rawIntervals.iterator());
|
||||
|
||||
// prepare to read intervals one-by-one, as needed
|
||||
intervals = merger;
|
||||
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
|
||||
writerToUse = writer;
|
||||
|
|
|
|||
|
|
@ -1,104 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.bed;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaron
|
||||
* Date: Oct 5, 2009
|
||||
* Time: 5:46:45 PM
|
||||
*/
|
||||
public class BedParser {
|
||||
// the GATk operates as a one based location, bed files are 0 based
|
||||
static final int TO_ONE_BASED_ADDITION = 1;
|
||||
|
||||
// the buffered reader input
|
||||
private final BufferedReader mIn;
|
||||
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
// our array of locations
|
||||
private List<GenomeLoc> mLocations;
|
||||
|
||||
/**
|
||||
* parse a bed file, given it's location
|
||||
*
|
||||
* @param fl
|
||||
*/
|
||||
public BedParser(GenomeLocParser genomeLocParser,File fl) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
try {
|
||||
mIn = new BufferedReader(new FileReader(fl));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fl, e);
|
||||
}
|
||||
mLocations = parseLocations();
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a bed file, given an input reader
|
||||
*
|
||||
* @param fl the bed file
|
||||
*/
|
||||
public BedParser(BufferedReader fl) {
|
||||
mIn = fl;
|
||||
mLocations = parseLocations();
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the locations
|
||||
*
|
||||
* @return a list of GenomeLocs, sorted and merged
|
||||
*/
|
||||
private List<GenomeLoc> parseLocations() {
|
||||
String line = null;
|
||||
List<GenomeLoc> locArray = new ArrayList<GenomeLoc>();
|
||||
try {
|
||||
while ((line = mIn.readLine()) != null) {
|
||||
locArray.add(parseLocation(genomeLocParser,line));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new UserException.MalformedFile("Unable to parse line in BED file.");
|
||||
}
|
||||
return locArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a single location
|
||||
*
|
||||
* @param line the line, as a string
|
||||
* @return a parsed genome loc
|
||||
*/
|
||||
public static GenomeLoc parseLocation(GenomeLocParser genomeLocParser,String line) {
|
||||
String contig;
|
||||
int start;
|
||||
int stop;
|
||||
try {
|
||||
String parts[] = line.split("\\s+");
|
||||
contig = parts[0];
|
||||
start = Integer.valueOf(parts[1]) + TO_ONE_BASED_ADDITION;
|
||||
stop = Integer.valueOf(parts[2]); // the ending point is an open interval
|
||||
} catch (Exception e) {
|
||||
throw new UserException.MalformedFile("Unable to process bed file line = " + line, e);
|
||||
}
|
||||
|
||||
// we currently drop the rest of the bed record, which can contain names, scores, etc
|
||||
return genomeLocParser.createGenomeLoc(contig, start, stop, true);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* return the sorted, and merged (for overlapping regions)
|
||||
*
|
||||
* @return an arraylist
|
||||
*/
|
||||
public List<GenomeLoc> getLocations() {
|
||||
return mLocations;
|
||||
}
|
||||
}
|
||||
|
|
@ -61,14 +61,7 @@ public class IntervalFileMergingIterator implements Iterator<GenomeLoc> {
|
|||
|
||||
try {
|
||||
XReadLines reader = new XReadLines(f);
|
||||
|
||||
if (f.getName().toUpperCase().endsWith(".BED")) {
|
||||
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(),
|
||||
StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ;
|
||||
} else {
|
||||
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(),
|
||||
StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ;
|
||||
}
|
||||
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator() ));
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(f, e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,8 +8,8 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
|
|
@ -94,9 +94,9 @@ public class IntervalUtils {
|
|||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
|
||||
// case: BED file
|
||||
if (file_name.toUpperCase().endsWith(".BED")) {
|
||||
BedParser parser = new BedParser(glParser,inputFile);
|
||||
ret.addAll(parser.getLocations());
|
||||
if ( file_name.toUpperCase().endsWith(".BED") ) {
|
||||
// this is now supported in Tribble
|
||||
throw new ReviewedStingException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported");
|
||||
}
|
||||
else {
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.interval;
|
|||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
|
@ -52,22 +51,13 @@ public class StringToGenomeLocIteratorAdapter implements Iterator<GenomeLoc> {
|
|||
|
||||
private PushbackIterator<String> it = null;
|
||||
|
||||
public enum FORMAT { BED, GATK };
|
||||
|
||||
FORMAT myFormat = FORMAT.GATK;
|
||||
|
||||
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator<String> it, FORMAT format) {
|
||||
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser, Iterator<String> it) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.it = new PushbackIterator<String>(it);
|
||||
myFormat = format;
|
||||
}
|
||||
|
||||
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator<String> it ) {
|
||||
this(genomeLocParser,it,FORMAT.GATK);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
String s = null;
|
||||
String s;
|
||||
boolean success = false;
|
||||
|
||||
// skip empty lines:
|
||||
|
|
@ -83,9 +73,7 @@ public class StringToGenomeLocIteratorAdapter implements Iterator<GenomeLoc> {
|
|||
}
|
||||
|
||||
public GenomeLoc next() {
|
||||
|
||||
if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeLoc(it.next());
|
||||
return BedParser.parseLocation( genomeLocParser,it.next() );
|
||||
return genomeLocParser.parseGenomeLoc(it.next());
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
|
|||
collect.downsampleFraction = null;
|
||||
collect.downsampleCoverage = null;
|
||||
collect.intervals = new ArrayList<IntervalBinding<Feature>>();
|
||||
collect.intervals.add(new IntervalBinding<Feature>("intervals".toLowerCase()));
|
||||
//collect.intervals.add(new IntervalBinding<Feature>("intervals".toLowerCase()));
|
||||
collect.excludeIntervals = new ArrayList<IntervalBinding<Feature>>();
|
||||
collect.numberOfThreads = 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,68 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.bed;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.List;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
|
||||
|
||||
public class BedParserUnitTest extends BaseTest {
|
||||
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private File bedFile = new File("public/testdata/sampleBedFile.bed");
|
||||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
File referenceFile = new File(b36KGReference);
|
||||
try {
|
||||
seq = new CachingIndexedFastaSequenceFile(referenceFile);
|
||||
}
|
||||
catch(FileNotFoundException ex) {
|
||||
throw new UserException.CouldNotReadInputFile(referenceFile,ex);
|
||||
}
|
||||
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLoadBedFile() {
|
||||
BedParser parser = new BedParser(genomeLocParser,bedFile);
|
||||
List<GenomeLoc> location = parser.getLocations();
|
||||
Assert.assertEquals(location.size(), 4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBedParsing() {
|
||||
BedParser parser = new BedParser(genomeLocParser,bedFile);
|
||||
List<GenomeLoc> location = parser.getLocations();
|
||||
Assert.assertEquals(location.size(), 4);
|
||||
Assert.assertTrue(location.get(0).getContig().equals("20"));
|
||||
Assert.assertTrue(location.get(1).getContig().equals("20"));
|
||||
Assert.assertTrue(location.get(2).getContig().equals("22"));
|
||||
Assert.assertTrue(location.get(3).getContig().equals("22"));
|
||||
|
||||
// now check the the start positions
|
||||
Assert.assertEquals(location.get(0).getStart(), 1);
|
||||
Assert.assertEquals(location.get(1).getStart(), 1002);
|
||||
Assert.assertEquals(location.get(2).getStart(), 1001);
|
||||
Assert.assertEquals(location.get(3).getStart(), 2001);
|
||||
|
||||
// now check the the stop positions
|
||||
Assert.assertEquals(location.get(0).getStop(), 999);
|
||||
Assert.assertEquals(location.get(1).getStop(), 2000);
|
||||
Assert.assertEquals(location.get(2).getStop(), 5000);
|
||||
Assert.assertEquals(location.get(3).getStop(), 6000);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue