gatk-3.8/R/phasing/scripts/plot_all.theoretical_and_em...

182 lines
8.1 KiB
R
Raw Normal View History

NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE))
NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site
#
#USE_EMPIRICAL_WINDOWS = c(10, 2)
#
USE_EMPIRICAL_WINDOWS = c(2)
TWO_COLORS = c("red", "darkgreen")
######################################################################
# Phasing as a function of SPECIFIC intra-het distances:
######################################################################
load("testSpecificDistances.RData")
MAX_DISTANCE = 10^3
PQ_PHASING_THRESH = 10.0
distances = list()
phaseRateDistances = list()
distancesLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0))
n_distance <- n_locDistancePQReadsWindow$distance
n_PQ <- n_locDistancePQReadsWindow$PQ
distanceVector = sort(unique(n_distance))
distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)]
numDists = length(distanceVector)
phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists))
print(paste("numDists= ", numDists, sep=""))
print(paste(distanceVector, collapse=", "))
for (i in 1:numDists) {
d = distanceVector[i]
print(paste("d= ", d, sep=""))
dInds = which(n_distance == d)
phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds)
}
distances[nextIndex] = list(distanceVector)
phaseRateDistances[nextIndex] = list(phasedFractionVector)
distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
distances[nextIndex] = list(DISTANCES)
phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead)
distancesLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS)
######################################################################
# Phasing as a function of depth:
######################################################################
load("testDepths.RData")
depths = list()
phaseRateDepths = list()
depthsLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0))
depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc)
phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES)
depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
useLength = which(READ_LENGTHS == 101)
depths[nextIndex] = depthsX[useLength]
phaseRateDepths[nextIndex] = depthsY[useLength]
depthsLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
######################################################################
# Distribution of intra-het distances:
######################################################################
load("intraHetDistancesDistrib.RData")
empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1
empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST
empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE)
empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances)
scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS)
######################################################################
# Phasing as a function of MEAN intra-het distance:
######################################################################
load("testIntraHetDistances.RData")
hetDistances = list()
phaseRateHetDistances = list()
hetDistancesLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0))
hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist)
phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased)
hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES)
phaseRateHetDistances[nextIndex] = list(pPhaseTheta)
hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS)
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000))
######################################################################
# Phasing as a function of window size:
######################################################################
load("theoretical_window_on_empirical.RData")
windows = list()
phaseRateWindows = list()
windowsLeg = vector()
NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE))
NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE))
NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome
windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0))
windows[1] = list(windowPhasedConsistent$window)
phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES)
windowsLeg[1] = paste("HiSeq", sep="")
windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
# Use numerical integration over theoretical distances distribution:
load("testWindows.RData")
doneInds = which(pPhaseWindow != -1)
windows[2] = list(WINDOW_SIZES[doneInds])
phaseRateWindows[2] = list(pPhaseWindow[doneInds])
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
# Use theoretical sampling of distances:
load("theoretical_window.RData")
windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)