diff --git a/prefpy/GaussianRUMGMM.py b/prefpy/GaussianRUMGMM.py
new file mode 100644
index 0000000..f3fc219
--- /dev/null
+++ b/prefpy/GaussianRUMGMM.py
@@ -0,0 +1,107 @@
+from scipy.stats import norm
+import time
+import math
+import numpy as np
+
+# Probability that one alternative is preferred to another.
+# x is the difference between the means of the two alternatives; the
+# difference of two independent N(mu, 1) utilities is N(x, 2).
+def prPair(x):
+    return norm.cdf(x, loc=0, scale=math.sqrt(2))
+
+# First derivative of the pairwise probability with respect to x.
+def dPrPair(x):
+    return np.exp(-x ** 2 / 4) / (2 * math.sqrt(math.pi))
+
+# Second derivative of the pairwise probability with respect to x.
+def ddPrPair(x):
+    return -x * np.exp(-x ** 2 / 4) / (4 * math.sqrt(math.pi))
+
+# Calculate the breakings: breaking[i, j] counts how often alternative i
+# is ranked above alternative j in the data.
+def dataBreaking(data, m, normalize):
+    breaking = np.zeros((m, m), int)
+    n = len(data)
+    for d in data:
+        for i in range(0, m):
+            for j in range(i + 1, m):
+                breaking[d[i], d[j]] += 1
+    if normalize:
+        return breaking / n
+    else:
+        return breaking
+
+# For debugging purposes: the expected (population) breaking matrix.
+def trueBreaking(Mu):
+    m = len(Mu)
+    A = np.zeros((m, m), float)
+    for i in range(0, m):
+        for j in range(0, m):
+            x = Mu[i] - Mu[j]
+            A[i, j] = norm.cdf(x, loc=0, scale=math.sqrt(2))
+    np.fill_diagonal(A, 0)
+    return A
+
+# Gradient of the GMM objective function.
+def gradientPrPair(breaking, theta, n):
+    theta = theta[0]
+    m = len(theta)
+    grad = np.zeros((m, 1), float)
+    for i in range(0, m):
+        for j in range(0, m):
+            if j != i:
+                x = theta[i] - theta[j]
+                grad[i] += 2 * (breaking[i][j] - n * prPair(x)) * dPrPair(x)
+    return grad
+
+# Hessian matrix of the GMM objective function.
+def hessianPrPair(breaking, theta, n):
+    theta = theta[0]
+    m = len(theta)
+    hessian = np.zeros((m, m), float)
+    for i in range(0, m):
+        for j in range(0, m):
+            if j != i:
+                x = theta[i] - theta[j]
+                hessian[i][i] += 2 * (breaking[i][j] - n * prPair(x)) * ddPrPair(x) - 2 * n * dPrPair(x) ** 2
+                if j > i:
+                    hessian[i][j] = 2 * n * dPrPair(x) ** 2 - 2 * (breaking[i][j] - n * prPair(x)) * ddPrPair(x)
+                else:
+                    # The matrix is symmetric for complete rankings, so reuse
+                    # the entry computed when the outer loop visited row j.
+                    hessian[i][j] = hessian[j][i]
+    return hessian
+
+#' GMM method for estimating a random utility model with normal distributions
+#'
+#' @param data matrix of full rankings, one row per agent
+#' @param m number of alternatives
+#' @param n number of agents
+#' @param itr number of Newton iterations to run
+#' @return estimated mean parameters of the underlying normal distributions
+#'   (variance is fixed at 1), shifted so that the smallest mean is 0
+#' @export
+#' @examples
+#' Params = GenerateRUMParameters(5, "normal")
+#' Data = GenerateRUMData(Params, 5, 100, "normal")
+#' GMMGaussianRUM(Data, 5, 100)
+def GMMGaussianRUM(data, m, n, itr=1):
+
+    t0 = time.time()  # get starting time
+
+    muhat = np.ones((1, m), float)
+    breaking = dataBreaking(data, m, False)
+
+    for _ in range(itr):
+        try:
+            Hinv = np.linalg.pinv(hessianPrPair(breaking, muhat, n))
+        except np.linalg.LinAlgError:
+            Hinv = 0.01 * np.identity(m)
+        Grad = gradientPrPair(breaking, muhat, n)
+        muhat = (muhat.transpose() - np.dot(Hinv, Grad)).transpose()
+        muhat = muhat - muhat.min()
+
+    t = time.time() - t0
+    print("Time used:", t)
+
+    return muhat
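+
+# Minimal smoke test (an illustrative sketch, not part of the original
+# module): estimate means from a few hand-written rankings over three
+# alternatives, where each row lists alternatives from best to worst.
+if __name__ == "__main__":
+    toy = np.array([[0, 1, 2], [0, 2, 1], [1, 0, 2], [0, 1, 2]])
+    print(GMMGaussianRUM(toy, m=3, n=len(toy), itr=3))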
diff --git a/prefpy/GaussianRUMtest.py b/prefpy/GaussianRUMtest.py
new file mode 100644
index 0000000..c03faa7
--- /dev/null
+++ b/prefpy/GaussianRUMtest.py
@@ -0,0 +1,28 @@
+from generate import *
+from likelihood_rum import *
+from GaussianRUMGMM import *
+import numpy as np
+
+if __name__ == '__main__':
+
+    m = 4
+    n = 1000
+    ntrials = 100
+    sse = 0
+
+    for t in range(0, ntrials):
+        print("Trial: ", t)
+        Params = GenerateRUMParameters(m, "normal")
+        Data = GenerateRUMData(Params, m, n, "normal")
+        GroundTruth = Params["Mean"]
+        GroundTruth = GroundTruth - GroundTruth.min()
+        mu = GMMGaussianRUM(Data, m, n, itr=1)
+        mu = mu - mu.min()
+        se = np.square(GroundTruth - mu[0]).sum()
+        print("Ground Truth: ", GroundTruth)
+        print("Estimate: ", mu[0])
+        print("Error: ", se)
+        sse += se
+
+    mse = sse / ntrials
+    print("MSE: ", mse)
diff --git a/prefpy/generate.py b/prefpy/generate.py
new file mode 100644
index 0000000..0def9b6
--- /dev/null
+++ b/prefpy/generate.py
@@ -0,0 +1,140 @@
+import numpy as np
+
+def GenerateRUMParameters(m, distribution):
+    if distribution == 'normal':
+        parameter = dict(m=m, Mean=np.random.uniform(0, 1, (m,)), SD=np.ones(m))
+        parameter["order"] = parameter["Mean"].ravel().argsort()
+    elif distribution == 'exponential':
+        unscaled = np.random.uniform(0, 1, (m,))
+        parameter = dict(m=m, Mean=unscaled / unscaled.sum())
+    else:
+        raise ValueError('Distribution name "%s" not recognized' % distribution)
+    return parameter
+
+#' Generate observations of ranks given parameters
+#'
+#' Given a list of parameters (generated via the GenerateRUMParameters function),
+#' generate random utilities from these models and then return their ranks
+#'
+#' @param Params inference object from an Estimation function, or parameters object from a generate function
+#' @param m number of alternatives
+#' @param n number of agents
+#' @param distribution can be either 'normal' or 'exponential'
+#' @return a matrix of observed rankings, one row per agent, listing alternatives from most to least preferred
+#' @export
+#' @examples
+#' Params = GenerateRUMParameters(10, "normal")
+#' GenerateRUMData(Params, 10, 5, "normal")
+#' Params = GenerateRUMParameters(10, "exponential")
+#' GenerateRUMData(Params, 10, 5, "exponential")
+def GenerateRUMData(Params, m, n, distribution):
+    if distribution == "exponential":
+        # Each agent draws m exponential arrival times; smaller draws rank earlier.
+        A = np.zeros((n, m), int)
+        for i in range(0, n):
+            A[i] = np.random.exponential(scale=1 / Params["Mean"], size=m).argsort()
+        return A
+    elif distribution == "normal":
+        # Each agent draws m normal utilities; higher draws rank earlier.
+        A = np.zeros((n, m), int)
+        for i in range(0, n):
+            utilities = np.random.normal(loc=Params["Mean"], scale=Params["SD"], size=m)
+            A[i] = (-utilities).argsort()
+        return A
+    else:
+        raise ValueError('Distribution name "%s" not recognized' % distribution)
+
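+# Illustrative example (values vary run to run): for m = 3 and n = 2,
+# GenerateRUMData(Params, 3, 2, "normal") might return
+#     array([[2, 0, 1],
+#            [0, 2, 1]])
+# where row i lists agent i's alternatives from most to least preferred.
+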
+#' Breaks full orderings into pairwise comparisons
+#'
+#' Given full orderings, this function generates all pairwise comparisons.
+#' The original R version also supports several breaking options:
+#' 1. full - All available pairwise comparisons. This is used for partial
+#' rank data where the ranked objects are a random subset of all objects
+#' 2. adjacent - Only adjacent pairwise breakings
+#' 3. top - also takes in k, will break within the top k
+#' and will also generate pairwise comparisons comparing the
+#' top k with the rest of the data
+#' 4. top.partial - This is used for partial rank data where the ranked
+#' alternatives are preferred over the non-ranked alternatives
+#' Only the full breaking is implemented in this port; the R originals are
+#' retained below as comments.
+#'
+#' The first column is the preferred alternative, and the second column is the
+#' less preferred alternative. The third column gives the rank distance between
+#' the two alternatives, and the fourth column enumerates the agent that the
+#' pairwise comparison came from.
+#'
+#' @param Data data in full ranking format, one row per agent
+#' @return Pairwise breakings, where the columns are winner, loser, rank distance (the latter used for Zemel) and agent
+#' @export
+#' @examples
+#' pairs = Breaking(Data)
+def Breaking(Data):
+    rows = Data.shape[0]
+    cols = Data.shape[1]
+    count = 0
+    final = np.zeros((int(rows * cols * (cols - 1) / 2), 4), int)
+    for i in range(0, len(Data)):
+        current_list = Data[i]
+        for j in range(0, len(current_list)):
+            for k in range(j + 1, len(current_list)):
+                final[count, 0] = current_list[j]  # winner
+                final[count, 1] = current_list[k]  # loser
+                final[count, 2] = k - j            # rank distance
+                final[count, 3] = i + 1            # agent (1-indexed)
+                count = count + 1
+    return final
+
+# The original R implementations of the other breaking methods are retained
+# below for reference until they are ported:
+
+# pair.full <- function(rankings) pair.top.k(rankings, length(rankings))
+
+# pair.top.k <- function(rankings, k) {
+#   pair.top.helper <- function(first, rest) {
+#     pc <- c()
+#     z <- length(rest)
+#     for(i in 1:z) pc <- rbind(pc, array(as.numeric(c(first, rest[i], i))))
+#     pc
+#   }
+#   if(length(rankings) <= 1 | k <= 0) c()
+#   else rbind(pair.top.helper(rankings[1], rankings[-1]), pair.top.k(rankings[-1], k - 1))
+# }
+
+# pair.adj <- function(rankings) {
+#   if(length(rankings) <= 1) c()
+#   else rbind(c(rankings[1], rankings[2]), pair.adj(rankings[-1]))
+# }
+
+# pair.top.partial <- function(rankings, m) {
+#   # this is used in the case when we have missing ranks that we can
+#   # fill in at the end of the ranking. We can assume here that all
+#   # ranked items have higher preferences than non-ranked items
+#   # (e.g. election data)
+
+#   # the number of alternatives that are not missing
+#   k <- length(rankings)
+
+#   # these are the missing rankings
+#   missing <- Filter(function(x) !(x %in% rankings), 1:m)
+
+#   # if there is more than one item missing, scramble the rest and place them in the ranking
+#   if(m - k > 1) missing <- scramble(missing)
+
+#   # now just apply the top k breaking, with the missing elements
+#   # inserted at the end
+#   pair.top.k(c(rankings, missing), k)
+# }
+
+# break.into <- function(Data, breakfunction, ...) {
+#   n <- nrow(Data)
+#   # applying a Filter(identity..., ) to each row removes all of the missing data
+#   # this is used in the case that only a partial ordering is provided
+#   tmp <- do.call(rbind, lapply(1:n, function(row) cbind(breakfunction(Filter(identity, Data[row, ]), ...), row)))
+#   colnames(tmp) <- c("V1", "V2", "distance", "agent")
+#   tmp
+# }
+
+# if(method == "full") Data.pairs <- break.into(Data, pair.full)
+# if(method == "adjacent") Data.pairs <- break.into(Data, pair.adj)
+# if(method == "top") Data.pairs <- break.into(Data, pair.top.k, k = k)
+# if(method == "top.partial") Data.pairs <- break.into(Data, pair.top.partial, m = m)
+
+# Data.pairs
+# }
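+
+# Minimal end-to-end sketch (illustrative, not part of the original module):
+# draw parameters, sample rankings, and break them into pairwise comparisons.
+if __name__ == "__main__":
+    params = GenerateRUMParameters(4, "normal")
+    rankings = GenerateRUMData(params, 4, 3, "normal")
+    print(Breaking(rankings)[:5])  # first few (winner, loser, distance, agent) rows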