diff --git a/R/phasing/play_RBP_theoretical.R b/R/phasing/play_RBP_theoretical.R new file mode 100644 index 000000000..e261f326f --- /dev/null +++ b/R/phasing/play_RBP_theoretical.R @@ -0,0 +1,54 @@ +# +#options(warn=2) +#options(error=recover) +# + +HALF = high_dimensional_integrate(1, -200, 0, dnorm) +print(paste("Should be ~ HALF: ", HALF, sep="")) + + +k = 75 +#theta = 10^-2 +theta = 10^-3 + +p = pHetPairLteDistance(k, theta) +print(paste(p, " = pHetPairLteDistance(k= ", k, ", theta= ", theta, ")", sep="")) + + +L = 76 +fragmentSize = 452 + + +p = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, fragmentSize, k) +print(paste(p, " = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= ", L, ", fragmentSize= ", fragmentSize, ", k= ", k, ")", sep="")) + +Fm = 392 +Fs = 44 + +p = pFragmentSize(300, Fm, Fs) +print(paste(p, " = pFragmentSize(300, Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) + + +p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs) +print(paste(p, " = pFragmentsReadsCanCoverHetPairAtDistance(L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) + + +meanDepth = 65 +nReadsToPhase = 1 +p = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs) +print(paste(p, " = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) + + +p = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, Fm, Fs) +print(paste(p, " = pDirectlyPhaseHetPair(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) + + +windowDistances = c(100, 100, 100, 100, 100) +phaseIndex = 2 +p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs) +print(paste(p, " = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep="")) + + + +traceback() +warnings() diff --git a/R/phasing/plot.intraHetDistancesDistrib.R b/R/phasing/plot.intraHetDistancesDistrib.R new file mode 100644 index 000000000..ffe0c698c --- /dev/null +++ b/R/phasing/plot.intraHetDistancesDistrib.R @@ -0,0 +1,13 @@ +theta = 10^-3 +params = paste("theta= ", theta, sep="") + +MIN_DIST = 1 +MAX_DIST = 10^4 +BY_DIST = 10 +DISTANCES = seq(from=MIN_DIST, to=MAX_DIST+BY_DIST, by=BY_DIST) +freqAtLteDist = pHetPairLteDistance(DISTANCES, theta) + +scatter(DISTANCES, freqAtLteDist, "intraHetDistancesDistrib", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", main=params) + + +save(list = ls(all=TRUE), file = "intraHetDistancesDistrib.RData") diff --git a/R/phasing/plot.testDepths.R b/R/phasing/plot.testDepths.R new file mode 100644 index 000000000..8d747a248 --- /dev/null +++ b/R/phasing/plot.testDepths.R @@ -0,0 +1,39 @@ +theta = 10^-3 + +Fm_BASE = 392 - 2 * 101 # The mean insert size == 190 +Fs = 44 + +nReadsToPhase = 1 + +params = paste("nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="") + + + +MEAN_DEPTHS = 0:65 +NUM_DEPTHS = length(MEAN_DEPTHS) + +READ_LENGTHS = c(18, 36, 76, 101, 125, 150, 175, 200, 400, 800, 1000) +READ_LENGTHS = rev(READ_LENGTHS) +NUM_READ_LENGTHS = length(READ_LENGTHS) + +depthsX = list() +depthsY = list() +depthsLeg = vector() + +for (i in 1:NUM_READ_LENGTHS) { + pPhaseDepth = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_DEPTHS)) + Fm = Fm_BASE + 2 * READ_LENGTHS[i] + for (j in 1:NUM_DEPTHS) { + pPhaseDepth[j] = pDirectlyPhaseHetPair(MEAN_DEPTHS[j], nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs) + } + depthsX[i] = list(MEAN_DEPTHS) + depthsY[i] = list(pPhaseDepth) + depthsLeg[i] = paste("L= ", READ_LENGTHS[i], sep="") +} + +scatter(depthsX, depthsY, "testDepths", xlab="Mean depth", ylab="Phaseability", main=params, leg=depthsLeg, legPos="topleft", width=14, height=7, type="b") + + + + +save(list = ls(all=TRUE), file = "testDepths.RData") diff --git a/R/phasing/plot.testFragments.R b/R/phasing/plot.testFragments.R new file mode 100644 index 000000000..7ae39c44c --- /dev/null +++ b/R/phasing/plot.testFragments.R @@ -0,0 +1,47 @@ +theta = 10^-3 + +L = 101 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, sep="") + + + +MEAN_SIZES = seq(1,2000,20) +STD_SIZES = seq(0,200,5) + + +testFragments = matrix(nrow=length(MEAN_SIZES), ncol=length(STD_SIZES)) +for (i in 1:length(MEAN_SIZES)) { + test_mean_fragment_size = MEAN_SIZES[i] + print(paste("test_mean_fragment_size: ", test_mean_fragment_size, sep="")) + for (j in 1:length(STD_SIZES)) { + test_std_fragment_size = STD_SIZES[j] + print(paste("test_std_fragment_size: ", test_std_fragment_size, sep="")) + + testFragments[i,j] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, test_mean_fragment_size, test_std_fragment_size) + } +} + + +pdf('testFragments.pdf') + +library(gplots) +heatmap.2(testFragments, ylab = "Mean fragment size", xlab = "Standard deviation fragment size", labRow = MEAN_SIZES, labCol = STD_SIZES, Rowv = NA, Colv = NA, dendrogram = "none", scale="none", revC = FALSE, density.info="none", trace="none", main=params) + +library(scatterplot3d) +xMeans = as.vector(t(matrix(rep.int(MEAN_SIZES, length(STD_SIZES)), ncol = length(STD_SIZES)))) +yStds = rep.int(STD_SIZES, length(MEAN_SIZES)) +zPhaseRate = as.vector(t(testFragments)) +scatterplot3d(xMeans, yStds, zPhaseRate, xlab = "Mean fragment size", ylab = "Standard deviation fragment size", zlab = "Phasing rate", main=params) + +bestCombo = which.max(zPhaseRate) +print(paste("For ", params, ", BEST choice gives phaseability of ", zPhaseRate[bestCombo], " using mean fragment = ", xMeans[bestCombo], ", std. fragment = ", yStds[bestCombo], sep = "")) +dev.off() + + + + +save(list = ls(all=TRUE), file = "testFragments.RData") diff --git a/R/phasing/plot.testIntraHetDistances.R b/R/phasing/plot.testIntraHetDistances.R new file mode 100644 index 000000000..cb5f857c4 --- /dev/null +++ b/R/phasing/plot.testIntraHetDistances.R @@ -0,0 +1,25 @@ +L = 101 + +Fm = 392 +Fs = 44 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") + + + +MEAN_INTRA_HET_DISTANCES = seq(from=2, to=20002, by=50) +THETAS = meanIntraHetDistanceToTheta(MEAN_INTRA_HET_DISTANCES) +NUM_THETAS = length(THETAS) + +pPhaseTheta = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_THETAS)) +for (i in 1:NUM_THETAS) { + pPhaseTheta[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, THETAS[i], Fm, Fs) +} +scatter(MEAN_INTRA_HET_DISTANCES, pPhaseTheta, "testIntraHetDistances", xlab="Mean intra-het distance", ylab="Phaseability", main=params, type="b") + + + +save(list = ls(all=TRUE), file = "testIntraHetDistances.RData") diff --git a/R/phasing/plot.testReadLengths.R b/R/phasing/plot.testReadLengths.R new file mode 100644 index 000000000..fb3316749 --- /dev/null +++ b/R/phasing/plot.testReadLengths.R @@ -0,0 +1,24 @@ +theta = 10^-3 + +Fm_BASE = 392 - 2 * 101 # The mean insert size == 190 +Fs = 44 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="") + + +READ_LENGTHS = seq(from=30, to=1000, by=10) +NUM_READ_LENGTHS = length(READ_LENGTHS) + +pPhaseReadLength = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_READ_LENGTHS)) +for (i in 1:NUM_READ_LENGTHS) { + Fm = Fm_BASE + 2 * READ_LENGTHS[i] + pPhaseReadLength[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs) +} +scatter(READ_LENGTHS, pPhaseReadLength, "testReadLengths", xlab="Read length", ylab="Phaseability", main=params, type="b") + + + +save(list = ls(all=TRUE), file = "testReadLengths.RData") diff --git a/R/phasing/plot.testSpecificDistances.R b/R/phasing/plot.testSpecificDistances.R new file mode 100644 index 000000000..8dfc0b2a6 --- /dev/null +++ b/R/phasing/plot.testSpecificDistances.R @@ -0,0 +1,19 @@ +L = 101 + +Fm = 392 +Fs = 44 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") + + +DISTANCES = 0:1000 +pPhaseHetPairAtDistWithRead = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, DISTANCES, Fm, Fs) + +scatter(DISTANCES, pPhaseHetPairAtDistWithRead, "testSpecificDistances", xlab="Intra-het distance", ylab="Phaseability", main=params) + + + +save(list = ls(all=TRUE), file = "testSpecificDistances.RData") diff --git a/R/phasing/plot.testSpecificFragments.R b/R/phasing/plot.testSpecificFragments.R new file mode 100644 index 000000000..099cf5b3f --- /dev/null +++ b/R/phasing/plot.testSpecificFragments.R @@ -0,0 +1,8 @@ +L = 76 +k = 75 +params = paste("L= ", L, ", k= ", k, sep="") + +FRAGMENT_SIZES = 0:100 + 2 * L +pCoverHetPairWithRead = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, FRAGMENT_SIZES, k) + +scatter(FRAGMENT_SIZES, pCoverHetPairWithRead, "testSpecificFragments", xlab="Fragment size", ylab="Probability of covering het pair", main=params) diff --git a/R/phasing/plot.testWindows.R b/R/phasing/plot.testWindows.R new file mode 100644 index 000000000..dd9cc1eee --- /dev/null +++ b/R/phasing/plot.testWindows.R @@ -0,0 +1,32 @@ +theta = 10^-3 + +Fm = 392 +Fs = 44 + +L = 101 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, sep="") + +# +#options(warn=2) +#options(error=recover) +# + +MAX_WINDOW_SIZE = 10 + +WINDOW_SIZES = 2:MAX_WINDOW_SIZE +NUM_WINDOW_SIZES = length(WINDOW_SIZES) + +pPhaseWindow = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_WINDOW_SIZES)) +for (i in 1:NUM_WINDOW_SIZES) { + n = WINDOW_SIZES[i] + phaseIndex = middleOfWindowIndex(n) + pPhaseWindow[i] = pDirectlyPhaseHetPairUsingWindow(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex) + + save(list = ls(all=TRUE), file = "testWindows.RData") +} + +scatter(WINDOW_SIZES, pPhaseWindow, "testWindows", xlab="Window size", ylab="Phaseability", main=params, type="b") diff --git a/R/phasing/plot.theoretical_window.R b/R/phasing/plot.theoretical_window.R new file mode 100644 index 000000000..f14acaf3f --- /dev/null +++ b/R/phasing/plot.theoretical_window.R @@ -0,0 +1,28 @@ +L = 101 + +Fm = 392 +Fs = 44 + +meanDepth = 65 +nReadsToPhase = 1 + +theta = 10^-3 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") + + +MAX_NUM_DISTS = 10^4 +distances = sampleIntraHetDistances(MAX_NUM_DISTS, theta) +print(paste("Using ", MAX_NUM_DISTS, " THEORETICAL distances...", sep="")) + + +MAX_WINDOW_SIZE = 10 +FILE_NAME = "theoretical_window" + +phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME) +phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow +WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES + +phaseProbsWindow = colMeans(phaseProbsPositionWindow) + +scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b") diff --git a/R/phasing/plot.theoretical_window_on_empirical.R b/R/phasing/plot.theoretical_window_on_empirical.R new file mode 100644 index 000000000..b1463538b --- /dev/null +++ b/R/phasing/plot.theoretical_window_on_empirical.R @@ -0,0 +1,30 @@ +L = 101 + +Fm = 392 +Fs = 44 + +meanDepth = 65 +nReadsToPhase = 1 + +params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="") + + +distances = scan("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt", what=list(dist=0)) +distances = distances$dist + +MAX_NUM_DISTS = 10^4 +NUM_DISTS_TO_USE = min(MAX_NUM_DISTS, length(distances)) +distances = distances[1:NUM_DISTS_TO_USE] +print(paste("Using ", NUM_DISTS_TO_USE, " EMPIRICAL distances...", sep="")) + + +MAX_WINDOW_SIZE = 10 +FILE_NAME = "theoretical_window_on_empirical" + +phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME) +phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow +WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES + +phaseProbsWindow = colMeans(phaseProbsPositionWindow) + +scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b") diff --git a/R/phasing/plot_all.theoretical_and_empirical.R b/R/phasing/plot_all.theoretical_and_empirical.R new file mode 100644 index 000000000..31b093578 --- /dev/null +++ b/R/phasing/plot_all.theoretical_and_empirical.R @@ -0,0 +1,181 @@ +NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE)) +NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site + + +# +#USE_EMPIRICAL_WINDOWS = c(10, 2) +# +USE_EMPIRICAL_WINDOWS = c(2) + + +TWO_COLORS = c("red", "darkgreen") + + +###################################################################### +# Phasing as a function of SPECIFIC intra-het distances: +###################################################################### +load("testSpecificDistances.RData") + +MAX_DISTANCE = 10^3 +PQ_PHASING_THRESH = 10.0 + +distances = list() +phaseRateDistances = list() +distancesLeg = vector() + +for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { + n = USE_EMPIRICAL_WINDOWS[nextIndex] + n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0)) + n_distance <- n_locDistancePQReadsWindow$distance + n_PQ <- n_locDistancePQReadsWindow$PQ + + distanceVector = sort(unique(n_distance)) + distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)] + numDists = length(distanceVector) + + phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists)) + + print(paste("numDists= ", numDists, sep="")) + print(paste(distanceVector, collapse=", ")) + + for (i in 1:numDists) { + d = distanceVector[i] + print(paste("d= ", d, sep="")) + + dInds = which(n_distance == d) + phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds) + } + + distances[nextIndex] = list(distanceVector) + phaseRateDistances[nextIndex] = list(phasedFractionVector) + distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="") +} + +nextIndex = nextIndex+1 +distances[nextIndex] = list(DISTANCES) +phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead) +distancesLeg[nextIndex] = "Theoretical (window = 2)" # params + +scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS) + + + +###################################################################### +# Phasing as a function of depth: +###################################################################### +load("testDepths.RData") + +depths = list() +phaseRateDepths = list() +depthsLeg = vector() + +for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { + n = USE_EMPIRICAL_WINDOWS[nextIndex] + RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0)) + depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc) + phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES) + depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="") +} + +nextIndex = nextIndex+1 +useLength = which(READ_LENGTHS == 101) +depths[nextIndex] = depthsX[useLength] +phaseRateDepths[nextIndex] = depthsY[useLength] +depthsLeg[nextIndex] = "Theoretical (window = 2)" # params + +scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) + + + +###################################################################### +# Distribution of intra-het distances: +###################################################################### +load("intraHetDistancesDistrib.RData") + +empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1 +empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST + +empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE) +empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances) + +scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS) + + + +###################################################################### +# Phasing as a function of MEAN intra-het distance: +###################################################################### +load("testIntraHetDistances.RData") + +hetDistances = list() +phaseRateHetDistances = list() +hetDistancesLeg = vector() + +for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) { + n = USE_EMPIRICAL_WINDOWS[nextIndex] + meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0)) + + hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist) + phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased) + hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="") +} + +nextIndex = nextIndex+1 +hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES) +phaseRateHetDistances[nextIndex] = list(pPhaseTheta) +hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params + +scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS) + +scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000)) + + +###################################################################### +# Phasing as a function of window size: +###################################################################### +load("theoretical_window_on_empirical.RData") + +windows = list() +phaseRateWindows = list() +windowsLeg = vector() + +NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE)) +NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE)) +NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome + + +windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0)) +windows[1] = list(windowPhasedConsistent$window) +phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES) +windowsLeg[1] = paste("HiSeq", sep="") + + +windows[2] = list(WINDOW_SIZES) +phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow))) +windowsLeg[2] = "Theoretical" # params + +scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) + + + +# Use numerical integration over theoretical distances distribution: +load("testWindows.RData") + +doneInds = which(pPhaseWindow != -1) + +windows[2] = list(WINDOW_SIZES[doneInds]) +phaseRateWindows[2] = list(pPhaseWindow[doneInds]) +windowsLeg[2] = "Theoretical" # params + +scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS) + + + +# Use theoretical sampling of distances: +load("theoretical_window.RData") + +windows[2] = list(WINDOW_SIZES) +phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow))) +windowsLeg[2] = "Theoretical" # params + +scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)