~fshodan/bnfinder/trunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Copyright (c) 2004-2007 Bartek Wilczynski and Norbert Dojer; All Rights Reserved.
#
# This software is distributable under the terms of the GNU
# General Public License (GPL) v2, the text of which can be found at
# http://www.gnu.org/copyleft/gpl-2.0.txt. Installing, importing or otherwise
# using this module constitutes acceptance of the terms of this License.
#
# Disclaimer
# 
# This software is provided "as-is".  There are no expressed or implied
# warranties of any kind, including, but not limited to, the warranties
# of merchantability and fitness for a given application.  In no event
# shall the authors be liable for any direct, indirect, incidental,
# special, exemplary or consequential damages (including, but not limited
# to, loss of use, data or profits, or business interruption) however
# caused and on any theory of liability, whether in contract, strict
# liability or tort (including negligence or otherwise) arising in any way
# out of the use of this software, even if advised of the possibility of
# such damage.
#

import stats,math,util

def linspace(minv,maxv,num):
    """Return num evenly spaced values from minv to maxv, inclusive.

    Arguments:
    minv -- first value of the result
    maxv -- last value of the result
    num  -- number of values to generate (num>=1)

    Values are computed by index (minv + i*step) rather than by repeated
    addition: the old accumulating loop could lose the final point to
    floating-point drift (e.g. linspace(0,0.3,4) returned only 3 values,
    because 0.1+0.1+0.1 > 0.3 in binary floating point).
    """
    if num==1:
        # Degenerate request: a single point; the step formula would
        # divide by zero here, so handle it explicitly.
        return [minv]
    step=1.0*(maxv-minv)/(num-1)
    # Build the first num-1 points by index, then append the exact
    # endpoint so maxv is always included verbatim.
    return [minv+step*i for i in range(num-1)]+[maxv]

class Transform:
    """Abstract base class for transformations of continuous data.

    Defines the interface subclasses are expected to implement:
    parameter estimation (estimate), application (transform) and
    parameter (de)serialization (getParams/fromParams).  All methods
    here are no-ops returning None.
    """

    def __init__(self,*args,**kwds):
        """Accept arbitrary configuration arguments.

        The base class stores nothing; subclasses interpret them.
        """
        pass

    def estimate(self,vector):
        """Fit transformation parameters to the input data.

        Implementations store the fitted parameters on self and return
        them in serialized form (suitable for fromParams).
        """
        pass

    def transform(self,vector):
        """Apply the previously fitted parameters to a data vector."""
        pass

    def estTrans(self,vector):
        """Fit parameters on vector, then return vector transformed.

        Convenience wrapper: only the transformed data are returned.
        """
        self.estimate(vector)
        return self.transform(vector)

    def fromParams(self,params):
        """Restore internal parameters from their serialized form."""
        pass

    def getParams(self):
        """Return the internal parameters in serialized form."""
        pass

class BiNormTransform(Transform):
    """Sigmoid transform derived from a mixture of two Gaussians.

    Parameters (mu0, mu1, sigma, p0, p1) describe two equal-variance
    Gaussian components; transform() maps each value to the resulting
    logistic posterior, scaled by the temperature-like factor t.
    """

    def __init__(self,t=1.0):
        # scaling factor applied in the exponent denominator
        self.t=t

    def _posterior(self,x):
        # Logistic posterior of component 1 for a single observation x.
        exponent=(self.mu1**2-self.mu0**2+2*(self.mu0-self.mu1)*x)/(2*self.t*self.sigma**2)
        return 1.0/(1.0+(self.p0/self.p1)*math.exp(exponent))

    def transform(self,l):
        """Map every value in l through the two-gaussian posterior.

        Values whose exponent overflows math.exp are mapped to 0.0.
        """
        out=[]
        for v in l:
            try:
                out.append(self._posterior(v))
            except OverflowError:
                # exp() blew up => posterior is numerically zero
                out.append(0.0)
        return out

    def fromParams(self,p):
        (self.mu0,self.mu1,self.sigma,self.p0,self.p1)=p

    def getParams(self):
        return self.mu0, self.mu1, self.sigma, self.p0, self.p1

    
class BiNormSimple(BiNormTransform):
    """Crudest two-gaussian estimate: no iteration at all.

    Places the two component means one half standard deviation on
    either side of the sample mean, with equal priors.
    """

    def estimate(self,l):
        center=stats.mean(l)
        spread=stats.stdev(l)/2
        self.sigma=spread
        self.mu0=center-spread
        self.mu1=center+spread
        # equal priors for both components
        self.p0=0.5
        self.p1=0.5
        return self.mu0, self.mu1, self.sigma, self.p0, self.p1

class BiNormMeans(BiNormTransform):
    """Simple estimation of mixture of two gaussians

    Iteratively splits the sample at a moving threshold `med`, takes the
    mean of each half as a component mean, and moves the threshold to the
    midpoint of the two means until it stops changing (a k-means-like
    fixed point with k=2).

    NOTE(review): this code relies on Python 2 semantics — filter()
    must return a list for the `l0*2` concatenation below to work.

    Adapted from previous code in 
    """
    def estimate(self,l):
        # Sum of squared deviations of l around m (NOT divided by n).
        def sumvar(l,m):
            return stats.sum(map(lambda x:(x-m)**2,l))
        # Start the threshold at the sample median (stats.lmedianscore).
        med=stats.lmedianscore(l)
        oldmed=med+1  # any value != med, to force at least one iteration
        # Exact float-equality convergence: stop when the threshold is stable.
        while med!=oldmed:
            l0=filter(lambda x: x<med,l)   # strictly below the threshold
            l1=filter(lambda x: x>med,l)   # strictly above the threshold
            lm=filter(lambda x: x==med,l)  # exactly on the threshold
            # Doubling l0/l1 before appending the ties gives threshold
            # points half the weight of the others in each component mean.
            mu0=stats.mean(l0*2+lm)
            mu1=stats.mean(l1*2+lm)
            # New threshold: midpoint of the two component means.
            (oldmed,med)=(med,(mu0+mu1)/2)
            #print mu0,mu1, oldmed,med#,l0,l1
        # Pooled within-component deviation, floored at 1e-10 so the
        # posterior formula in BiNormTransform never divides by zero.
        # NOTE(review): the extra (mu1-med)**2 term and the assignment of
        # the tie points lm entirely to component 0 look like deliberate
        # corrections, but their derivation is not visible here — verify
        # against the original reference before changing.
        self.sigma=max(0.1**10, math.sqrt((sumvar(l0+lm,mu0)+sumvar(l1,mu1)+(mu1-med)**2)/(len(l))))
        # Priors: ties contribute half an observation to each component.
        self.p0=(len(l0)+len(lm)*0.5)/len(l)
        self.p1=(len(l1)+len(lm)*0.5)/len(l)
        self.mu0=mu0
        self.mu1=mu1
        return self.mu0, self.mu1, self.sigma, self.p0, self.p1


class GaussianMixture(Transform):
    """ Implements a mixture of N Gaussian variables. Estimation done with EM algorithm.

    default N==2, we assume a common variance"""
    def __init__(self,n=2):
        # number of mixture components
        self.n=n
        

    def _Estep(self,mus,vector):
        #assign the nearest mu to each observation
        # Returns {component index -> list of observations assigned to it}.
        # NOTE(review): assumes util.bin_search(v, mus) returns a pair whose
        # first element is the index of the nearest mean — confirm in util.
        import collections
        obs=collections.defaultdict(list)
        for v in vector:
            m,val=util.bin_search(v,mus)
            obs[m].append(v)
        return obs
        
    def _Mstep(self,obs):
        #find best self.mus given self.assign
        # NOTE(review): a component that received no observations yields
        # obs[i]==[] here, and stats.mean([]) will likely fail — the caller
        # apparently relies on every component staying non-empty.
        mus=[]
        for i in range(self.n):
            mus.append(stats.mean(obs[i]))
        return mus
    
    def estimate(self,vector):
        """Fit n means, a common variance and per-component priors.

        Hard-assignment EM (k-means-like): alternate assigning each point
        to its nearest mean (_Estep) and recomputing the means (_Mstep)
        until the means are exactly unchanged (float equality).
        Returns (mus, var, priors).
        """
        sv=sorted(vector)
        minv=sv[0]
        maxv=sv[-1]
        # n+1 equally spaced bin edges over the data range...
        self.mus=linspace(minv,maxv,self.n+1)
        # ...collapsed to the n bin midpoints as initial means.
        self.mus=[(x+self.mus[i+1])/2 for i,x in list(enumerate(self.mus))[:-1]]
        self.obs=self._Estep(self.mus,sv)
        mus=self._Mstep(self.obs)
        while mus!=self.mus:
            self.mus=mus #update means
            self.obs=self._Estep(mus,sv) #update observation assignment
            mus=self._Mstep(self.obs) # calculate new mus
        #estimate variance
        # Common variance: pooled squared deviation from each point's
        # assigned mean, divided by the total sample size.
        self.var=0.0
        for i,mu in enumerate(self.mus):
            for x in self.obs[i]:
                self.var+=(x-mu)**2
        self.var/=len(vector)
        #estimate priors
        # Prior of each component = fraction of points assigned to it.
        self.priors={}
        for x,l in self.obs.items():
            self.priors[x]=len(l)*1.0/len(vector)
        return self.mus,self.var,self.priors
            
    def transform(self,vector):
        """Transformation using multiple gaussian mixture. as of yet unfinished, requires code changes in data.py and score.py

        NOTE(review): the loop body is empty and there is no return
        statement, so this currently returns None for any input.
        #TODO: adapt data.py and score.py to allow more general gaussian mixtures
        """

        result=[]
        for v in vector:
            pass
        #trans=lambda x: 1.0/(1.0+(self.p0/self.p1)*math.exp((self.mu1**2-self.mu0**2+2*(self.mu0-self.mu1)*x)/(2*self.t*self.sigma**2)))
        #return map(trans,l)