# Lab 4: complete the R code and submit a single-page document (c.pdf)

22 de Mar de 2023
1 de 26

### Task: complete the R code and a single-page document (c.pdf)

• 1. you need to complete the r code and a single-page document containing two figures, report the parameters you estimate and discuss how well your power law fits the network data, and explain the finding. Question: images incomplete r code: # IDS 564 - Spring 2023 # Lab 4 R Code - Estimating the Degree Exponent of a Scale-free Network #========================================================================= ===================== # 0. INITIATION ========================================================================== = #========================================================================= ===================== ## You'll need VGAM for the zeta function # install.packages("VGAM") ## When prompted to install from binary version, select no library(VGAM) ## You'll need this when calculating goodness of fit # install.packages("parallel") library(parallel) library(ggplot2) library(ggthemes) library(dplyr) library(tidyr) ##------------------------------------------------------------------------------ ## This function will calculate the zeta function for you. You don't need to worry about it! Run it and continue. ## gen_zeta(gamma , shift) will give you a number gen_zeta <- function (gamma, shift = 1, deriv = 0) { deriv.arg <- deriv rm(deriv) if (!is.Numeric(deriv.arg, length.arg = 1, integer.valued = TRUE))
• 2. stop("'deriv' must be a single non-negative integer") if (deriv.arg < 0 || deriv.arg > 2) stop("'deriv' must be 0, 1, or 2") if (deriv.arg > 0) return(zeta.specials(Zeta.derivative(gamma, deriv.arg = deriv.arg, shift = shift), gamma, deriv.arg, shift)) if (any(special <- Re(gamma) <= 1)) { ans <- gamma ans[special] <- Inf special3 <- Re(gamma) < 1 ans[special3] <- NA special4 <- (0 < Re(gamma)) & (Re(gamma) < 1) & (Im(gamma) == 0) # ans[special4] <- Zeta.derivative(gamma[special4], deriv.arg = deriv.arg, shift = shift) special2 <- Re(gamma) < 0 if (any(special2)) { gamma2 <- gamma[special2] cgamma <- 1 - gamma2 ans[special2] <- 2^(gamma2) * pi^(gamma2 - 1) * sin(pi * gamma2/2) * gamma(cgamma) * Recall(cgamma) } if (any(!special)) { ans[!special] <- Recall(gamma[!special]) } return(zeta.specials(ans, gamma, deriv.arg, shift)) } aa <- 12 ans <- 0 for (ii in 0:(aa - 1)) ans <- ans + 1/(shift + ii)^gamma ans <- ans + Zeta.aux(shape = gamma, aa, shift = shift) ans[shift <= 0] <- NaN zeta.specials(ans, gamma, deriv.arg = deriv.arg, shift = shift) } ## example: gen_zeta(2.1, 4) ##------------------------------------------------------------------------------ ## The P_k (the CDF) P_k = function(gamma, k, k_sat){ ### fill the function return(1 - ( gen_zeta(gamma, k) / ... )) }
• 3. ##------------------------------------------------------------------------------ my_theme <- theme_classic() + theme(legend.position = "bottom", legend.box = "horizontal", legend.direction = "horizontal", title = element_text(size = 18), axis.title = element_text(size = 14), axis.text.y = element_text(size = 16), axis.text.x = element_text(size = 16), strip.text = element_text(size = 14), strip.background = element_blank(), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.title = element_text(size = 14), legend.text = element_text(size = 14)) set.seed(123) #========================================================================= ===================== # 00. LOADING DATA ======================================================================== #========================================================================= ===================== ## Load data - fill the path of the folder where you put the file. If the file is in the same folder as the R code remove the path part. 
your_path = "your path" pat_citation_deg = read.csv(paste0(your_path, "lab 4 data - sample_patant_citation_deg.csv")) head(pat_citation_deg) tail(pat_citation_deg) summary(pat_citation_deg) ## let's have a look at the Log-log degree distribution plot (nothing to fill) p <- ggplot(pat_citation_deg, aes(x = degree)) + geom_point(stat = 'bin', color = "blue", size = 2.5, bins = 3 * ceiling(log(nrow(pat_citation_deg))))+ scale_x_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + scale_y_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + labs(x = "Degree", y = "Frequency") + my_theme + theme(title = element_text(size = 12 )) ## fit a line to the Log-log degree distribution (nothing to fill) p + geom_smooth(data = ggplot_build(p)\$data[[1]] %>% filter(!is.infinite(y)), ## this will take the binned data generated by ggplot to fit the line mapping = aes(x=exp(x), y= exp(y)), method = "lm", se=FALSE, color = "red", size = 0.75, alpha = 0.5)
• 4. #========================================================================= ===================== # 1. EXERCISE PART 1 - Estimating Gamma =================================================== #========================================================================= ===================== ## designate the data.frame to be used - and use standardized column names: id, degree (nothing to fill) my_df = pat_citation_deg %>% rename(id = patent_id) ##------------------------------------------------------------------------------ ## you'll write a for loop over individual unique degrees in the data-set to find the corresponding distance D ### let's create a data.frame with one column as each observed degree in our network; ### We will fill the other columns D and gamma as we calculate them in the loop (nothing to fill) D_df = data.frame(k_sat = unique(my_df\$degree), D= NA, gamma = NA) %>% arrange(k_sat) ## here you set up the maximum degree to check so that you do not have to do the computation for all degrees ### Let's set it up as 25th percentile of the unique observed degrees. Next line of code does that for you: (nothing to fill) max_degree_to_check = as.numeric(ceiling(quantile(unique(my_df\$degree), 0.25))) ### Now discard the rows of D_df you do not need (that are above the max_degree_to_check). Next line of code does it for you: (nothing to fill) D_df = D_df[D_df\$k_sat < max_degree_to_check,] ### Let's take a look at the distance data.frame we are about to fill in the loop: (nothing to fill) head(D_df) tail(D_df) ## Understand and fill parts of the code in this loop ## I recommend setting i = 1 and running each line of this loop on your own and checking what it gives you. This will help you fill the gaps ##------------------------------------------------------------------------------ ## let's work on the loop for (i in 1:(nrow(D_df))) { ## note: the loop starts slower but will speed up as it progresses. 
## let's show the current loop k_sat (so that we can see our progress): (nothing to fill) print(paste0("at %", round(100 * i/nrow(D_df), 2))) k_sat_temp = D_df\$k_sat[i]
• 5. ##---------------------------------------------------------------------------- ## let's create a temporary copy of the network degree data that contains degrees equal or above k_sat_temp: (nothing to fill) temp_df = my_df[my_df\$degree>k_sat_temp,] ##---------------------------------------------------------------------------- ## step 1: estimate gamma for this loop and call it 'temp_gamma' ### create a vector of k_i/ (k_sat) so that you can feed it to natural logarithm and sum over elements ### k_i is each observed node degree in your data; k_sat refers to the k_sat of this loop (nothing to fill) temp_vec_k_i = temp_df\$degree/(k_sat_temp) ### now use the above vector in (4.41); remember N is the number of nodes in your network. N = nrow(my_df) (nothing to fill) temp_gamma = 1 + (nrow(temp_df) / sum(log(temp_vec_k_i)) - 1/2) ## (4.41) ##---------------------------------------------------------------------------- ## step 2: now use (temp_gamma, k_sat_temp) to write (4.43) as a function called 'CDF_k' to pass the KS test in step 3: ### k will be a variable that KS test will use, so make it an argument of CDF_k; ### put gamma and k_sat of this loop in the body of the function CDF_k = function(k) { ### FILL THIS FUNCTION ACCORDING TO (4.43) return(1 - (gen_zeta(temp_gamma, k) / ...)) } ##---------------------------------------------------------------------------- ## step 3: run KS test and pass the statistic as D to the corresponding column of D_df KS_D = ks.test(temp_df\$degree, ...) 
### fill in with function you created above: just pass the function name (without parantheses, or brackets, or quotes) ### * you can take a look here if you couldn't figure it out: https://stats.stackexchange.com/questions/47730/r-defining-a-new-continuous-distribution- function-to-use-with-kolmogorov-smirno D_df[i,'D'] = as.numeric(KS_D\$statistic) ### (nothing to fill) ## let's also store the gamma so that we do not have to compute it again once we have an optimal k_sat (nothing to fill) D_df[i,'gamma'] = temp_gamma
• 6. } ##------------------------------------------------------------------------------ ## step 4: plot D against k_sat find the k_sat that minimizes D, and the corresponding gamma ### let's first take a look at the D_df we have formed (nothing to fill) head(D_df, 10) ### find the optimal k_sat that yields minimum D and call it 'optimal_k_sat' (nothing to fill) optimal_k_sat = D_df[which.min(D_df\$D),'k_sat'] ### let's take a look at the D_df we have formed (nothing to fill) ggplot(D_df %>% drop_na(), aes(x = k_sat, y = D)) + geom_point(size = 3, alpha = .5, color = "purple") + geom_vline(xintercept = optimal_k_sat, size = 1, color = "red") + ggplot2::annotate("text", x = optimal_k_sat + 15, y = as.numeric(quantile(D_df\$D,.85)), label = paste0("Optimal k_sat = ",optimal_k_sat), color = "red") + my_theme + labs(x = "k", y = "D") ### find the D corresponding to 'optimal_k_sat' (nothing to fill) min(D_df\$D) ### find the gamma corresponding to the optimal k_sat and call it 'optimal_gamma' (nothing to fill) (optimal_gamma = D_df[which.min(D_df\$D),'gamma']) ## Discard observations with degree below the best k_sat you found earlier. 
(nothing to fill) post_data = my_df %>% filter(degree >= optimal_k_sat) ##------------------------------------------------------------------------------ ## let's take a look at the resulting Log-log degree distribution plot for the remaining data-points (nothing to fill) p_post <- ggplot(post_data, aes(x = degree)) + geom_point(stat = 'bin', color = "blue", size = 2.5, bins = 3 * ceiling(log(nrow(post_data))))+ scale_x_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + scale_y_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + labs(x = "Degree", y = "Frequency") + my_theme ## fit a line to the Log-log degree distribution (nothing to fill) p_post + geom_smooth(data = ggplot_build(p_post)\$data[[1]] %>% filter(!is.infinite(y)), ## this will
• 7. take the binned data generated by ggplot to fit the line mapping = aes(x=exp(x), y= exp(y)), method = "lm", se=FALSE, color = "red", size = 0.75, alpha = 0.5) #========================================================================= ===================== # 2. EXERCISE PART 2 - Goodness-of-fit ==================================================== #========================================================================= ===================== ## We are going to create a vector of synthetic sequences of degrees and repeat the process M times ## Usually, M is pretty big, like M = 10,000. For now, let's set M = 100: (nothing to fill) M = 100 ## so let's create a data.frame of M rows, one for every D_synthetic we will generate (nothing to fill) D_gof_df = data.frame(iter = 1:M, D_synthetic = NA) ##------------------------------------------------------------------------------ ## step 1: store the distance you found in part 1 as D_real (nothing to fill) D_real = min(D_df\$D) ##------------------------------------------------------------------------------ ## I. Let's walk through steps 2 and 3 once outside of the loop ##------------------------------------------------------------------------------ ##------------------------------------------------------------------------------ ## step 2: you will need to define the inverse of the CDF function (so that you generate random probability values [0,1] and get degrees back) ### let's write the CDF that best fits the data (we did this in part 1): CDF_k = function(k) { ### FILL THE FUNCTION ACCORDING TO (4.43) return(1 - (gen_zeta(optimal_gamma, k) / ...)) } ### 2.1. Let's define the inverse of your CDF; (nothing to fill) ### if the next line is hard to understand, check here: https://stackoverflow.com/questions/23258482/use-inverse-cdf-to-generate-random-variable-in-r
• 8. #### mini step 2.1.1. this piece of code will create an inverse for you (that searches the interval 0 up to a big higher than the highest observed degree in our data) Inverse = function(f, interval = c(sqrt(min(post_data\$degree)), max(post_data\$degree))){ function(y) uniroot((function(x) f(x) - y), interval = interval, extendInt = "yes")\$root } inverse_CDF = Inverse(CDF_k) ### mini step 2.2.1 let's try it for a couple of numbers (nothing to fill) inverse_CDF(0.4) ## mini step 2.2.2. runif(1) will generate a random real number between 0 and 1. Let's pass that to inverse_CDF rand_p = runif(1) inverse_CDF(rand_p) ### mini step 2.2. let's generate 5 random numbers betwee 0 and 1 and get 5 degrees back from our inverse #### (unfortunately we have to write this complex code because inverse_CDF does not accept a vector; try inverse_CDF(c(0.1, 0.2)). ) rand_p = runif(5) unlist(lapply(rand_p, function(p){inverse_CDF(p)})) ### step 2.2.ok, you know how to generate 5 random degrees. Let's create n random degrees, where n is the number of degrees in our remaining data (nothing to fill) rand_p = runif(nrow(post_data)) rand_deg = unlist(lapply(rand_p, function(p){inverse_CDF(p)})) ## unfortunately this is a bit slow!!! rand_deg = unlist(mclapply(rand_p, function(p){inverse_CDF(p)}, mc.cores = parallel::detectCores() - 1)) ## this will make it a bit faster, but still not great! ##------------------------------------------------------------------------------ ## step 3: use ks.test to get a D_synthetic for the rand_deg you just generated and ### FILL THE ks.test ACCORDING TO THE DISCUSSION ON STEP 3 PART 2 KS_D = ks.test(rand_deg, ...) ## ks.test(first are synthetic degrees, second are real degrees) as.numeric(KS_D\$statistic) ##------------------------------------------------------------------------------ ## II. Now let's write the loop ##------------------------------------------------------------------------------ for (i in 1:M){ ## this may take a while!!! 
It took me 15 minutes to run M = 100 on a computer with 3.5 GHz 6-Core and 64 GB memory. Lower to M = 20 if necessary
• 9. print(paste0("at %",100 * i/M)) ##------------------------------------------------------------------------------ ## step 2: generate a synthetic (random) sequence of degrees rand_p = ... ### FILL AS WE DID ABOVE rand_deg = unlist(mclapply(rand_p, function(p){inverse_CDF(p)}, mc.cores = parallel::detectCores() - 1)) ## this will make it a bit faster ##------------------------------------------------------------------------------ ## step 3: find the distance between the synthetic sequence and CDF_k and store it # KS_D = ks.test(rand_deg, CDF_k) KS_D = ks.test(..., ...) ### FILL AS WE DID ABOVE ks.test(first are synthetic degrees, second are real degrees) D_gof_df[i,'D_synthetic'] = as.numeric(KS_D\$statistic) } ##------------------------------------------------------------------------------ ## Let's plot the results ### let's take a look at the D_df we have formed ggplot(D_gof_df, aes(x = D_synthetic)) + geom_histogram(bins = 20, color = "white", fill = "green", alpha = 0.9) + geom_vline(xintercept = D_real, size = 1, color = "brown") + my_theme + labs(x = "Distance", y = "Frequency", title = "Synthetic and Real Distances") # IDS 564 - Spring 2023 # Lab 4 R Code - Estimating the Degree Exponent of a Scale-free Network #========================================================================= ===================== # 0. INITIATION ========================================================================== = #========================================================================= ===================== ## You'll need VGAM for the zeta function # install.packages("VGAM") ## When prompted to install from binary version, select no
• 10. library(VGAM) ## You'll need this when calculating goodness of fit # install.packages("parallel") library(parallel) library(ggplot2) library(ggthemes) library(dplyr) library(tidyr) ##------------------------------------------------------------------------------ ## This function will calculate the zeta function for you. You don't need to worry about it! Run it and continue. ## gen_zeta(gamma , shift) will give you a number gen_zeta <- function (gamma, shift = 1, deriv = 0) { deriv.arg <- deriv rm(deriv) if (!is.Numeric(deriv.arg, length.arg = 1, integer.valued = TRUE)) stop("'deriv' must be a single non-negative integer") if (deriv.arg < 0 || deriv.arg > 2) stop("'deriv' must be 0, 1, or 2") if (deriv.arg > 0) return(zeta.specials(Zeta.derivative(gamma, deriv.arg = deriv.arg,
• 11. shift = shift), gamma, deriv.arg, shift)) if (any(special <- Re(gamma) <= 1)) { ans <- gamma ans[special] <- Inf special3 <- Re(gamma) < 1 ans[special3] <- NA special4 <- (0 < Re(gamma)) & (Re(gamma) < 1) & (Im(gamma) == 0) # ans[special4] <- Zeta.derivative(gamma[special4], deriv.arg = deriv.arg, shift = shift) special2 <- Re(gamma) < 0 if (any(special2)) { gamma2 <- gamma[special2] cgamma <- 1 - gamma2 ans[special2] <- 2^(gamma2) * pi^(gamma2 - 1) * sin(pi * gamma2/2) * gamma(cgamma) * Recall(cgamma) } if (any(!special)) { ans[!special] <- Recall(gamma[!special]) } return(zeta.specials(ans, gamma, deriv.arg, shift)) } aa <- 12 ans <- 0
• 12. for (ii in 0:(aa - 1)) ans <- ans + 1/(shift + ii)^gamma ans <- ans + Zeta.aux(shape = gamma, aa, shift = shift) ans[shift <= 0] <- NaN zeta.specials(ans, gamma, deriv.arg = deriv.arg, shift = shift) } ## example: gen_zeta(2.1, 4) ##------------------------------------------------------------------------------ ## The P_k (the CDF) P_k = function(gamma, k, k_sat){ ### fill the function return(1 - ( gen_zeta(gamma, k) / ... )) } ##------------------------------------------------------------------------------ my_theme <- theme_classic() + theme(legend.position = "bottom", legend.box = "horizontal", legend.direction = "horizontal", title = element_text(size = 18), axis.title = element_text(size = 14), axis.text.y = element_text(size = 16), axis.text.x = element_text(size = 16), strip.text = element_text(size = 14), strip.background = element_blank(), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.title = element_text(size = 14), legend.text = element_text(size = 14))
• 13. set.seed(123) #========================================================================= ===================== # 00. LOADING DATA ======================================================================== #========================================================================= ===================== ## Load data - fill the path of the folder where you put the file. If the file is in the same folder as the R code remove the path part. your_path = "your path" pat_citation_deg = read.csv(paste0(your_path, "lab 4 data - sample_patant_citation_deg.csv")) head(pat_citation_deg) tail(pat_citation_deg) summary(pat_citation_deg) ## let's have a look at the Log-log degree distribution plot (nothing to fill) p <- ggplot(pat_citation_deg, aes(x = degree)) + geom_point(stat = 'bin', color = "blue", size = 2.5, bins = 3 * ceiling(log(nrow(pat_citation_deg))))+ scale_x_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + scale_y_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) +
• 14. labs(x = "Degree", y = "Frequency") + my_theme + theme(title = element_text(size = 12 )) ## fit a line to the Log-log degree distribution (nothing to fill) p + geom_smooth(data = ggplot_build(p)\$data[[1]] %>% filter(!is.infinite(y)), ## this will take the binned data generated by ggplot to fit the line mapping = aes(x=exp(x), y= exp(y)), method = "lm", se=FALSE, color = "red", size = 0.75, alpha = 0.5) #========================================================================= ===================== # 1. EXERCISE PART 1 - Estimating Gamma =================================================== #========================================================================= ===================== ## designate the data.frame to be used - and use standardized column names: id, degree (nothing to fill) my_df = pat_citation_deg %>% rename(id = patent_id) ##------------------------------------------------------------------------------ ## you'll write a for loop over individual unique degrees in the data-set to find the corresponding distance D ### let's create a data.frame with one column as each observed degree in our network; ### We will fill the other columns D and gamma as we calculate them in the loop (nothing to fill) D_df = data.frame(k_sat = unique(my_df\$degree), D= NA, gamma = NA) %>% arrange(k_sat)
• 15. ## here you set up the maximum degree to check so that you do not have to do the computation for all degrees ### Let's set it up as 25th percentile of the unique observed degrees. Next line of code does that for you: (nothing to fill) max_degree_to_check = as.numeric(ceiling(quantile(unique(my_df\$degree), 0.25))) ### Now discard the rows of D_df you do not need (that are above the max_degree_to_check). Next line of code does it for you: (nothing to fill) D_df = D_df[D_df\$k_sat < max_degree_to_check,] ### Let's take a look at the distance data.frame we are about to fill in the loop: (nothing to fill) head(D_df) tail(D_df) ## Understand and fill parts of the code in this loop ## I recommend setting i = 1 and running each line of this loop on your own and checking what it gives you. This will help you fill the gaps ##------------------------------------------------------------------------------ ## let's work on the loop for (i in 1:(nrow(D_df))) { ## note: the loop starts slower but will speed up as it progresses. ## let's show the current loop k_sat (so that we can see our progress): (nothing to fill) print(paste0("at %", round(100 * i/nrow(D_df), 2))) k_sat_temp = D_df\$k_sat[i] ##---------------------------------------------------------------------------- ## let's create a temporary copy of the network degree data that contains degrees equal or above
• 16. k_sat_temp: (nothing to fill) temp_df = my_df[my_df\$degree>k_sat_temp,] ##---------------------------------------------------------------------------- ## step 1: estimate gamma for this loop and call it 'temp_gamma' ### create a vector of k_i/ (k_sat) so that you can feed it to natural logarithm and sum over elements ### k_i is each observed node degree in your data; k_sat refers to the k_sat of this loop (nothing to fill) temp_vec_k_i = temp_df\$degree/(k_sat_temp) ### now use the above vector in (4.41); remember N is the number of nodes in your network. N = nrow(my_df) (nothing to fill) temp_gamma = 1 + (nrow(temp_df) / sum(log(temp_vec_k_i)) - 1/2) ## (4.41) ##---------------------------------------------------------------------------- ## step 2: now use (temp_gamma, k_sat_temp) to write (4.43) as a function called 'CDF_k' to pass the KS test in step 3: ### k will be a variable that KS test will use, so make it an argument of CDF_k; ### put gamma and k_sat of this loop in the body of the function CDF_k = function(k) { ### FILL THIS FUNCTION ACCORDING TO (4.43) return(1 - (gen_zeta(temp_gamma, k) / ...)) } ##---------------------------------------------------------------------------- ## step 3: run KS test and pass the statistic as D to the corresponding column of D_df
• 17. KS_D = ks.test(temp_df\$degree, ...) ### fill in with function you created above: just pass the function name (without parantheses, or brackets, or quotes) ### * you can take a look here if you couldn't figure it out: https://stats.stackexchange.com/questions/47730/r-defining-a-new-continuous-distribution- function-to-use-with-kolmogorov-smirno D_df[i,'D'] = as.numeric(KS_D\$statistic) ### (nothing to fill) ## let's also store the gamma so that we do not have to compute it again once we have an optimal k_sat (nothing to fill) D_df[i,'gamma'] = temp_gamma } ##------------------------------------------------------------------------------ ## step 4: plot D against k_sat find the k_sat that minimizes D, and the corresponding gamma ### let's first take a look at the D_df we have formed (nothing to fill) head(D_df, 10) ### find the optimal k_sat that yields minimum D and call it 'optimal_k_sat' (nothing to fill) optimal_k_sat = D_df[which.min(D_df\$D),'k_sat'] ### let's take a look at the D_df we have formed (nothing to fill) ggplot(D_df %>% drop_na(), aes(x = k_sat, y = D)) + geom_point(size = 3, alpha = .5, color = "purple") + geom_vline(xintercept = optimal_k_sat, size = 1, color = "red") + ggplot2::annotate("text", x = optimal_k_sat + 15, y = as.numeric(quantile(D_df\$D,.85)), label = paste0("Optimal k_sat = ",optimal_k_sat), color = "red") +
• 18. my_theme + labs(x = "k", y = "D") ### find the D corresponding to 'optimal_k_sat' (nothing to fill) min(D_df\$D) ### find the gamma corresponding to the optimal k_sat and call it 'optimal_gamma' (nothing to fill) (optimal_gamma = D_df[which.min(D_df\$D),'gamma']) ## Discard observations with degree below the best k_sat you found earlier. (nothing to fill) post_data = my_df %>% filter(degree >= optimal_k_sat) ##------------------------------------------------------------------------------ ## let's take a look at the resulting Log-log degree distribution plot for the remaining data-points (nothing to fill) p_post <- ggplot(post_data, aes(x = degree)) + geom_point(stat = 'bin', color = "blue", size = 2.5, bins = 3 * ceiling(log(nrow(post_data))))+ scale_x_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + scale_y_continuous(trans = "log", breaks = scales::trans_breaks("log", function(x) exp(x)), labels = scales::trans_format("log", scales::math_format(e^.x))) + labs(x = "Degree", y = "Frequency") + my_theme
• 19. ## fit a line to the Log-log degree distribution (nothing to fill) p_post + geom_smooth(data = ggplot_build(p_post)\$data[[1]] %>% filter(!is.infinite(y)), ## this will take the binned data generated by ggplot to fit the line mapping = aes(x=exp(x), y= exp(y)), method = "lm", se=FALSE, color = "red", size = 0.75, alpha = 0.5) #========================================================================= ===================== # 2. EXERCISE PART 2 - Goodness-of-fit ==================================================== #========================================================================= ===================== ## We are going to create a vector of synthetic sequences of degrees and repeat the process M times ## Usually, M is pretty big, like M = 10,000. For now, let's set M = 100: (nothing to fill) M = 100 ## so let's create a data.frame of M rows, one for every D_synthetic we will generate (nothing to fill) D_gof_df = data.frame(iter = 1:M, D_synthetic = NA) ##------------------------------------------------------------------------------ ## step 1: store the distance you found in part 1 as D_real (nothing to fill) D_real = min(D_df\$D) ##------------------------------------------------------------------------------
• 20. ## I. Let's walk through steps 2 and 3 once outside of the loop ##------------------------------------------------------------------------------ ##------------------------------------------------------------------------------ ## step 2: you will need to define the inverse of the CDF function (so that you generate random probability values [0,1] and get degrees back) ### let's write the CDF that best fits the data (we did this in part 1): CDF_k = function(k) { ### FILL THE FUNCTION ACCORDING TO (4.43) return(1 - (gen_zeta(optimal_gamma, k) / ...)) } ### 2.1. Let's define the inverse of your CDF; (nothing to fill) ### if the next line is hard to understand, check here: https://stackoverflow.com/questions/23258482/use-inverse-cdf-to-generate-random-variable-in-r #### mini step 2.1.1. this piece of code will create an inverse for you (that searches the interval 0 up to a big higher than the highest observed degree in our data) Inverse = function(f, interval = c(sqrt(min(post_data\$degree)), max(post_data\$degree))){ function(y) uniroot((function(x) f(x) - y), interval = interval, extendInt = "yes")\$root } inverse_CDF = Inverse(CDF_k) ### mini step 2.2.1 let's try it for a couple of numbers (nothing to fill)
• 21. inverse_CDF(0.4) ## mini step 2.2.2. runif(1) will generate a random real number between 0 and 1. Let's pass that to inverse_CDF rand_p = runif(1) inverse_CDF(rand_p) ### mini step 2.2. let's generate 5 random numbers betwee 0 and 1 and get 5 degrees back from our inverse #### (unfortunately we have to write this complex code because inverse_CDF does not accept a vector; try inverse_CDF(c(0.1, 0.2)). ) rand_p = runif(5) unlist(lapply(rand_p, function(p){inverse_CDF(p)})) ### step 2.2.ok, you know how to generate 5 random degrees. Let's create n random degrees, where n is the number of degrees in our remaining data (nothing to fill) rand_p = runif(nrow(post_data)) rand_deg = unlist(lapply(rand_p, function(p){inverse_CDF(p)})) ## unfortunately this is a bit slow!!! rand_deg = unlist(mclapply(rand_p, function(p){inverse_CDF(p)}, mc.cores = parallel::detectCores() - 1)) ## this will make it a bit faster, but still not great! ##------------------------------------------------------------------------------ ## step 3: use ks.test to get a D_synthetic for the rand_deg you just generated and ### FILL THE ks.test ACCORDING TO THE DISCUSSION ON STEP 3 PART 2 KS_D = ks.test(rand_deg, ...) ## ks.test(first are synthetic degrees, second are real degrees) as.numeric(KS_D\$statistic)
• 22. ##------------------------------------------------------------------------------ ## II. Now let's write the loop ##------------------------------------------------------------------------------ for (i in 1:M){ ## this may take a while!!! It took me 15 minutes to run M = 100 on a computer with 3.5 GHz 6-Core and 64 GB memory. Lower to M = 20 if necessary print(paste0("at %",100 * i/M)) ##------------------------------------------------------------------------------ ## step 2: generate a synthetic (random) sequence of degrees rand_p = ... ### FILL AS WE DID ABOVE rand_deg = unlist(mclapply(rand_p, function(p){inverse_CDF(p)}, mc.cores = parallel::detectCores() - 1)) ## this will make it a bit faster ##------------------------------------------------------------------------------ ## step 3: find the distance between the synthetic sequence and CDF_k and store it # KS_D = ks.test(rand_deg, CDF_k) KS_D = ks.test(..., ...) ### FILL AS WE DID ABOVE ks.test(first are synthetic degrees, second are real degrees) D_gof_df[i,'D_synthetic'] = as.numeric(KS_D\$statistic) } ##------------------------------------------------------------------------------ ## Let's plot the results ### let's take a look at the D_df we have formed