Added option for min number of matches between reads and alleles required to consider reads.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3548 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4ab1f440c3
commit
c38390eabb
|
|
@ -52,8 +52,11 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
public String ethnicity = "CaucasianUSA";
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 7;
|
||||
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MAXALLOWEDMISMATCHES = 6;
|
||||
|
||||
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||
public int MINREQUIREDMATCHES = 0;
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
|
|
@ -83,7 +86,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
||||
|
|
|
|||
|
|
@ -17,8 +17,11 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 5;
|
||||
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MAXALLOWEDMISMATCHES = 7;
|
||||
|
||||
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||
public int MINREQUIREDMATCHES = 0;
|
||||
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
|
|
@ -70,7 +73,7 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
|||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
MaxNumMatches = similarityReader.GetNumMatches();
|
||||
MaxConcordance = similarityReader.GetConcordance();
|
||||
|
|
|
|||
|
|
@ -34,11 +34,11 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
/**
|
||||
* Finds the most similar HLA allele for each read. Usage: java -jar GenomeAnalysisTK.jar -T FindClosestAllele -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
|
||||
* Finds the most similar HLA allele for each read (helps detect misalignments). Usage: java -jar GenomeAnalysisTK.jar -T FindClosestHLA -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval | grep -v INFO | sort -k1 > OUTPUT
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||
public class FindClosestHLAWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
||||
public String debugRead = "";
|
||||
|
||||
|
|
@ -36,7 +36,7 @@ import java.util.*;
|
|||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
||||
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T HLACaller -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
||||
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
|
||||
* @author shermanjia
|
||||
*/
|
||||
|
|
@ -63,8 +63,11 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 7;
|
||||
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MAXALLOWEDMISMATCHES = 6;
|
||||
|
||||
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||
public int MINREQUIREDMATCHES = 5;
|
||||
|
||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||
|
||||
|
|
@ -113,15 +116,34 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
|||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
|
||||
//Load pre-processing file for misaligned reads and list of alleles to search
|
||||
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
AllelesToSearch = similarityReader.GetAllelesToSearch();
|
||||
AlleleCount = similarityReader.GetAlleleCount();
|
||||
LocusCount = similarityReader.GetLocusCount();
|
||||
for (int i = 0; i < AllelesToSearch.size(); i++){
|
||||
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
//If pre-processing file exists, load contents
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
AllelesToSearch = similarityReader.GetAllelesToSearch();
|
||||
AlleleCount = similarityReader.GetAlleleCount();
|
||||
LocusCount = similarityReader.GetLocusCount();
|
||||
for (int i = 0; i < AllelesToSearch.size(); i++){
|
||||
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
|
||||
}
|
||||
}else{
|
||||
ReadsToDiscard = new ArrayList<String>();
|
||||
AlleleCount = new Hashtable();
|
||||
String name, d4_name; String [] n;
|
||||
for (int i = 0; i < HLAnames.length; i++){
|
||||
name = HLAnames[i].substring(4);
|
||||
n = name.split("\\*");
|
||||
d4_name = n[0] + "*" + n[1].substring(0, 4);
|
||||
if (!AllelesToSearch.contains(d4_name)){
|
||||
AllelesToSearch.add(d4_name);
|
||||
AlleleCount.put(d4_name, 0);
|
||||
}
|
||||
if (!LocusCount.containsKey(n[0])){
|
||||
LocusCount.put(n[0], 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Load genotypes and find polymorphic sites (sites that differ from reference)
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ public class SimilarityFileReader {
|
|||
return NumMismatches;
|
||||
}
|
||||
|
||||
public void ReadFile(String filename, int minAllowedMismatches){
|
||||
public void ReadFile(String filename, int minAllowedMismatches, int minRequiredMatches){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
|
|
@ -70,7 +70,7 @@ public class SimilarityFileReader {
|
|||
Concordance.put(s[0],matchFraction);
|
||||
NumMatches.put(s[0], s[4]);
|
||||
NumMismatches.put(s[0], numMismatches);
|
||||
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
|
||||
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < minRequiredMatches){
|
||||
ReadsToDiscard.add(s[0]);
|
||||
}else{
|
||||
Hashtable fourDigitAlleles = new Hashtable();
|
||||
|
|
|
|||
Loading…
Reference in New Issue