Added option for min number of matches between reads and alleles required to consider reads.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3548 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-06-14 16:08:49 +00:00
parent 4ab1f440c3
commit c38390eabb
5 changed files with 50 additions and 22 deletions

View File

@ -52,8 +52,11 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
public String ethnicity = "CaucasianUSA"; public String ethnicity = "CaucasianUSA";
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = ""; public String filterFile = "";
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7; public int MAXALLOWEDMISMATCHES = 6;
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
public int MINREQUIREDMATCHES = 0;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam"; String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
@ -83,7 +86,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
if (!filterFile.equals("")){ if (!filterFile.equals("")){
out.printf("INFO Reading properties file ... "); out.printf("INFO Reading properties file ... ");
SimilarityFileReader similarityReader = new SimilarityFileReader(); SimilarityFileReader similarityReader = new SimilarityFileReader();
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
ReadsToDiscard = similarityReader.GetReadsToDiscard(); ReadsToDiscard = similarityReader.GetReadsToDiscard();
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size()); out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
for (int i = 0; i < ReadsToDiscard.size(); i++){ for (int i = 0; i < ReadsToDiscard.size(); i++){

View File

@ -17,8 +17,11 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = ""; public String filterFile = "";
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 5; public int MAXALLOWEDMISMATCHES = 7;
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
public int MINREQUIREDMATCHES = 0;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
@ -70,7 +73,7 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
if (!filterFile.equals("")){ if (!filterFile.equals("")){
out.printf("INFO Reading properties file ... "); out.printf("INFO Reading properties file ... ");
SimilarityFileReader similarityReader = new SimilarityFileReader(); SimilarityFileReader similarityReader = new SimilarityFileReader();
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
ReadsToDiscard = similarityReader.GetReadsToDiscard(); ReadsToDiscard = similarityReader.GetReadsToDiscard();
MaxNumMatches = similarityReader.GetNumMatches(); MaxNumMatches = similarityReader.GetNumMatches();
MaxConcordance = similarityReader.GetConcordance(); MaxConcordance = similarityReader.GetConcordance();

View File

@ -34,11 +34,11 @@ import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Hashtable; import java.util.Hashtable;
/** /**
* Finds the most similar HLA allele for each read. Usage: java -jar GenomeAnalysisTK.jar -T FindClosestAllele -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT * Finds the most similar HLA allele for each read (helps detect misalignments). Usage: java -jar GenomeAnalysisTK.jar -T FindClosestHLA -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval | grep -v INFO | sort -k1 > OUTPUT
* @author shermanjia * @author shermanjia
*/ */
@Requires({DataSource.READS, DataSource.REFERENCE}) @Requires({DataSource.READS, DataSource.REFERENCE})
public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> { public class FindClosestHLAWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
public String debugRead = ""; public String debugRead = "";

View File

@ -36,7 +36,7 @@ import java.util.*;
import java.util.Map.Entry; import java.util.Map.Entry;
/** /**
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\ * Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T HLACaller -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
* @author shermanjia * @author shermanjia
*/ */
@ -63,8 +63,11 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false; public boolean ONLYFREQUENT = false;
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7; public int MAXALLOWEDMISMATCHES = 6;
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
public int MINREQUIREDMATCHES = 5;
GATKArgumentCollection args = this.getToolkit().getArguments(); GATKArgumentCollection args = this.getToolkit().getArguments();
@ -113,15 +116,34 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
HLAstoppos = HLADictionaryReader.GetStopPositions(); HLAstoppos = HLADictionaryReader.GetStopPositions();
//Load pre-processing file for misaligned reads and list of alleles to search //Load pre-processing file for misaligned reads and list of alleles to search
SimilarityFileReader similarityReader = new SimilarityFileReader(); if (!filterFile.equals("")){
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); //If pre-processing file exists, load contents
ReadsToDiscard = similarityReader.GetReadsToDiscard(); SimilarityFileReader similarityReader = new SimilarityFileReader();
AllelesToSearch = similarityReader.GetAllelesToSearch(); similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
AlleleCount = similarityReader.GetAlleleCount(); ReadsToDiscard = similarityReader.GetReadsToDiscard();
LocusCount = similarityReader.GetLocusCount(); AllelesToSearch = similarityReader.GetAllelesToSearch();
for (int i = 0; i < AllelesToSearch.size(); i++){ AlleleCount = similarityReader.GetAlleleCount();
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i))); LocusCount = similarityReader.GetLocusCount();
for (int i = 0; i < AllelesToSearch.size(); i++){
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
}
}else{
ReadsToDiscard = new ArrayList<String>();
AlleleCount = new Hashtable();
String name, d4_name; String [] n;
for (int i = 0; i < HLAnames.length; i++){
name = HLAnames[i].substring(4);
n = name.split("\\*");
d4_name = n[0] + "*" + n[1].substring(0, 4);
if (!AllelesToSearch.contains(d4_name)){
AllelesToSearch.add(d4_name);
AlleleCount.put(d4_name, 0);
}
if (!LocusCount.containsKey(n[0])){
LocusCount.put(n[0], 0);
}
}
} }
//Load genotypes and find polymorphic sites (sites that differ from reference) //Load genotypes and find polymorphic sites (sites that differ from reference)

View File

@ -53,7 +53,7 @@ public class SimilarityFileReader {
return NumMismatches; return NumMismatches;
} }
public void ReadFile(String filename, int minAllowedMismatches){ public void ReadFile(String filename, int minAllowedMismatches, int minRequiredMatches){
try{ try{
FileInputStream fstream = new FileInputStream(filename); FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream); DataInputStream in = new DataInputStream(fstream);
@ -70,7 +70,7 @@ public class SimilarityFileReader {
Concordance.put(s[0],matchFraction); Concordance.put(s[0],matchFraction);
NumMatches.put(s[0], s[4]); NumMatches.put(s[0], s[4]);
NumMismatches.put(s[0], numMismatches); NumMismatches.put(s[0], numMismatches);
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){ if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < minRequiredMatches){
ReadsToDiscard.add(s[0]); ReadsToDiscard.add(s[0]);
}else{ }else{
Hashtable fourDigitAlleles = new Hashtable(); Hashtable fourDigitAlleles = new Hashtable();