Remove support for BED file interval parsing in the GATK; it should all go through Tribble now. IndelRealigner no longer supports unordered interval input (which shouldn't have been used anyways). Temporarily commenting out serialization of arguments so that tests pass; this whole piece will be deleted soon anyways.

This commit is contained in:
Eric Banks 2011-10-27 13:38:08 -04:00
parent f7df8bdecc
commit 4a7e6fee3f
9 changed files with 28 additions and 236 deletions

View File

@ -25,14 +25,11 @@
package org.broadinstitute.sting.commandline;
import com.google.java.contract.Requires;
import net.sf.samtools.util.CloseableIterator;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;

View File

@ -89,18 +89,18 @@ public class GATKArgumentCollection {
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
*/
@ElementList(required = false)
//@ElementList(required = false)
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
public List<IntervalBinding<Feature>> intervals = Collections.emptyList();
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@ElementList(required = false)
//@ElementList(required = false)
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
public List<IntervalBinding<Feature>> excludeIntervals = Collections.emptyList();
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions

View File

@ -229,14 +229,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam")
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
/**
* For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.
* Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
*/
@Advanced
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted")
protected boolean TARGET_NOT_SORTED = false;
/**
* Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that
* the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'),
@ -366,30 +358,24 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
catch(FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex);
}
if ( !TARGET_NOT_SORTED ) {
NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY);
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();
// separate argument on semicolon first
for (String fileOrInterval : intervalsFile.split(";")) {
// if it's a file, add items to raw interval list
if (IntervalUtils.isIntervalFile(fileOrInterval)) {
merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) );
} else {
rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval));
}
NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY);
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();
// separate argument on semicolon first
for (String fileOrInterval : intervalsFile.split(";")) {
// if it's a file, add items to raw interval list
if (IntervalUtils.isIntervalFile(fileOrInterval)) {
merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) );
} else {
rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval));
}
if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator());
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
intervals = merger;
} else {
// read in the whole list of intervals for cleaning
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),
IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile)),
IntervalMergingRule.OVERLAPPING_ONLY);
intervals = locs.iterator();
}
if ( ! rawIntervals.isEmpty() )
merger.add(rawIntervals.iterator());
// prepare to read intervals one-by-one, as needed
intervals = merger;
currentInterval = intervals.hasNext() ? intervals.next() : null;
writerToUse = writer;

View File

@ -1,104 +0,0 @@
package org.broadinstitute.sting.utils.bed;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: aaron
* Date: Oct 5, 2009
* Time: 5:46:45 PM
*/
public class BedParser {
// the GATk operates as a one based location, bed files are 0 based
static final int TO_ONE_BASED_ADDITION = 1;
// the buffered reader input
private final BufferedReader mIn;
private GenomeLocParser genomeLocParser;
// our array of locations
private List<GenomeLoc> mLocations;
/**
* parse a bed file, given it's location
*
* @param fl
*/
public BedParser(GenomeLocParser genomeLocParser,File fl) {
this.genomeLocParser = genomeLocParser;
try {
mIn = new BufferedReader(new FileReader(fl));
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(fl, e);
}
mLocations = parseLocations();
}
/**
* parse a bed file, given an input reader
*
* @param fl the bed file
*/
public BedParser(BufferedReader fl) {
mIn = fl;
mLocations = parseLocations();
}
/**
* parse out the locations
*
* @return a list of GenomeLocs, sorted and merged
*/
private List<GenomeLoc> parseLocations() {
String line = null;
List<GenomeLoc> locArray = new ArrayList<GenomeLoc>();
try {
while ((line = mIn.readLine()) != null) {
locArray.add(parseLocation(genomeLocParser,line));
}
} catch (IOException e) {
throw new UserException.MalformedFile("Unable to parse line in BED file.");
}
return locArray;
}
/**
* parse a single location
*
* @param line the line, as a string
* @return a parsed genome loc
*/
public static GenomeLoc parseLocation(GenomeLocParser genomeLocParser,String line) {
String contig;
int start;
int stop;
try {
String parts[] = line.split("\\s+");
contig = parts[0];
start = Integer.valueOf(parts[1]) + TO_ONE_BASED_ADDITION;
stop = Integer.valueOf(parts[2]); // the ending point is an open interval
} catch (Exception e) {
throw new UserException.MalformedFile("Unable to process bed file line = " + line, e);
}
// we currently drop the rest of the bed record, which can contain names, scores, etc
return genomeLocParser.createGenomeLoc(contig, start, stop, true);
}
/**
* return the sorted, and merged (for overlapping regions)
*
* @return an arraylist
*/
public List<GenomeLoc> getLocations() {
return mLocations;
}
}

View File

@ -61,14 +61,7 @@ public class IntervalFileMergingIterator implements Iterator<GenomeLoc> {
try {
XReadLines reader = new XReadLines(f);
if (f.getName().toUpperCase().endsWith(".BED")) {
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(),
StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ;
} else {
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(),
StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ;
}
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator() ));
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(f, e);
}

View File

@ -8,8 +8,8 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -94,9 +94,9 @@ public class IntervalUtils {
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
// case: BED file
if (file_name.toUpperCase().endsWith(".BED")) {
BedParser parser = new BedParser(glParser,inputFile);
ret.addAll(parser.getLocations());
if ( file_name.toUpperCase().endsWith(".BED") ) {
// this is now supported in Tribble
throw new ReviewedStingException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported");
}
else {
/**

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.interval;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.bed.BedParser;
import java.util.Iterator;
@ -52,22 +51,13 @@ public class StringToGenomeLocIteratorAdapter implements Iterator<GenomeLoc> {
private PushbackIterator<String> it = null;
public enum FORMAT { BED, GATK };
FORMAT myFormat = FORMAT.GATK;
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator<String> it, FORMAT format) {
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser, Iterator<String> it) {
this.genomeLocParser = genomeLocParser;
this.it = new PushbackIterator<String>(it);
myFormat = format;
}
public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator<String> it ) {
this(genomeLocParser,it,FORMAT.GATK);
}
public boolean hasNext() {
String s = null;
String s;
boolean success = false;
// skip empty lines:
@ -83,9 +73,7 @@ public class StringToGenomeLocIteratorAdapter implements Iterator<GenomeLoc> {
}
public GenomeLoc next() {
if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeLoc(it.next());
return BedParser.parseLocation( genomeLocParser,it.next() );
return genomeLocParser.parseGenomeLoc(it.next());
}
public void remove() {

View File

@ -87,7 +87,7 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
collect.downsampleFraction = null;
collect.downsampleCoverage = null;
collect.intervals = new ArrayList<IntervalBinding<Feature>>();
collect.intervals.add(new IntervalBinding<Feature>("intervals".toLowerCase()));
//collect.intervals.add(new IntervalBinding<Feature>("intervals".toLowerCase()));
collect.excludeIntervals = new ArrayList<IntervalBinding<Feature>>();
collect.numberOfThreads = 1;
}

View File

@ -1,68 +0,0 @@
package org.broadinstitute.sting.utils.bed;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
import net.sf.picard.reference.IndexedFastaSequenceFile;
public class BedParserUnitTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
private File bedFile = new File("public/testdata/sampleBedFile.bed");
@BeforeClass
public void beforeTests() {
File referenceFile = new File(b36KGReference);
try {
seq = new CachingIndexedFastaSequenceFile(referenceFile);
}
catch(FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(referenceFile,ex);
}
genomeLocParser = new GenomeLocParser(seq);
}
@Test
public void testLoadBedFile() {
BedParser parser = new BedParser(genomeLocParser,bedFile);
List<GenomeLoc> location = parser.getLocations();
Assert.assertEquals(location.size(), 4);
}
@Test
public void testBedParsing() {
BedParser parser = new BedParser(genomeLocParser,bedFile);
List<GenomeLoc> location = parser.getLocations();
Assert.assertEquals(location.size(), 4);
Assert.assertTrue(location.get(0).getContig().equals("20"));
Assert.assertTrue(location.get(1).getContig().equals("20"));
Assert.assertTrue(location.get(2).getContig().equals("22"));
Assert.assertTrue(location.get(3).getContig().equals("22"));
// now check the the start positions
Assert.assertEquals(location.get(0).getStart(), 1);
Assert.assertEquals(location.get(1).getStart(), 1002);
Assert.assertEquals(location.get(2).getStart(), 1001);
Assert.assertEquals(location.get(3).getStart(), 2001);
// now check the the stop positions
Assert.assertEquals(location.get(0).getStop(), 999);
Assert.assertEquals(location.get(1).getStop(), 2000);
Assert.assertEquals(location.get(2).getStop(), 5000);
Assert.assertEquals(location.get(3).getStop(), 6000);
}
}