Coverage for src/serums/error_processor.py: 0%

30 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-11 16:43 +0000

1"""For preprocessing errors.""" 

2import numpy as np 

3from scipy.stats import ks_2samp, cramervonmises_2samp, anderson_ksamp 

4import serums.models 

5import pandas as pd 

6 

7 

class BinnedError:
    """Binned representation of a measurement-error distribution.

    The error is ``measured - truth``.  It can be cleaned of gross outliers
    and then recursively bisected until no bin shows a linear correlation
    with the dependency array(s) above a threshold.

    Attributes
    ----------
    truth : array_like
        Truth data.
    measured : array_like
        Measured data.
    dependencies : array_like
        Dependency array(s).  Either one array with one value per sample, or
        a 2-D array whose rows are individual dependency arrays (the last
        axis runs over samples).
    io_map : function
        I/O mapping function.  Stored only; not used by the methods below.
    preferences : dict
        Preferences.  Stored only; not used by the methods below.
    error : array_like
        Current error distribution.  Set by :meth:`normalize` and updated by
        :meth:`remove_outliers`.
    bins : list
        Result of the most recent :meth:`autobin` call.
    """

    def __init__(self, truth, measured, dependencies, io_map, preferences):
        self.truth = truth
        self.measured = measured
        self.dependencies = dependencies
        self.io_map = io_map
        self.preferences = preferences

    @staticmethod
    def _as_array(data):
        """Return *data* unchanged if it exposes ``shape``, else as an ndarray."""
        return data if hasattr(data, "shape") else np.asarray(data)

    def _current_error(self):
        """Return the stored error, computing it first if it does not exist."""
        error = getattr(self, "error", None)
        if error is None:
            error = self.normalize()
        return self._as_array(error)

    def normalize(self):
        """Compute the error distribution as ``measured - truth``.

        Plain sequences (lists, tuples) are converted to NumPy arrays first so
        the subtraction is element-wise; objects that already carry a
        ``shape`` (ndarrays, pandas objects) are used as they are.

        Returns
        -------
        array_like
            The error distribution, also stored in ``self.error``.
        """
        self.error = self._as_array(self.measured) - self._as_array(self.truth)
        return self.error

    def remove_outliers(self, n_sigma=10):
        """Drop errors farther than ``n_sigma`` standard deviations from the mean.

        The deviation is measured from the sample mean, so a biased but tight
        distribution is not discarded, and the comparison is inclusive, so a
        zero-spread sample is kept intact.  When samples are removed and
        ``self.dependencies`` has one entry per sample along its last axis,
        the same samples are removed from ``self.dependencies`` so it stays
        aligned with ``self.error``.

        Parameters
        ----------
        n_sigma : float, optional
            Rejection threshold in standard deviations.  Defaults to 10.

        Returns
        -------
        array_like
            The filtered error distribution, also stored in ``self.error``.
        """
        error = self._current_error()
        if len(error) == 0:
            # Mean and standard deviation are undefined; nothing to remove.
            self.error = error
            return self.error

        deviation = np.abs(error - np.mean(error))
        keep = np.asarray(deviation <= n_sigma * np.std(error))

        if not keep.all():
            deps = np.asarray(self.dependencies)
            if deps.ndim >= 1 and deps.shape[-1] == len(error):
                self.dependencies = deps[..., keep]

        self.error = error[keep]
        return self.error

    def autobin(self, error=None, dependencies=None, threshold=0.1):
        """Recursively bisect the error until no bin correlates with the dependencies.

        A bin is split at its midpoint while the absolute Pearson correlation
        between the error and any dependency array exceeds ``threshold``.
        Only the error-versus-dependency entries of the correlation matrix
        are inspected (its diagonal is always 1).  A bin with fewer than four
        samples is never split, because each half needs at least two samples
        for a correlation to be defined.

        Parameters
        ----------
        error : array_like, optional
            Error distribution to bin.  Defaults to ``self.error``.
        dependencies : array_like, optional
            Dependency array(s) aligned with ``error`` along the last axis.
            Defaults to ``self.dependencies``.
        threshold : float, optional
            Absolute correlation above which a bin is split.  Defaults to 0.1.

        Returns
        -------
        list
            The bins in their original order, also stored in ``self.bins``.

        Raises
        ------
        ValueError
            If the dependencies do not have one entry per error sample.
        """
        error = self._current_error() if error is None else self._as_array(error)
        deps = np.asarray(self.dependencies if dependencies is None else dependencies)
        if deps.ndim == 0 or deps.shape[-1] != len(error):
            raise ValueError(
                "dependencies must have one entry per error sample along the last axis"
            )
        self.bins = self._split_until_uncorrelated(error, deps, threshold)
        return self.bins

    def _split_until_uncorrelated(self, error, deps, threshold):
        """Return the list of leaf bins obtained by recursive bisection."""
        if not self._is_correlated(error, deps, threshold):
            return [error]
        lower, upper = self.bisect(error)
        # Cut the dependencies at the same sample index as the error.
        cut = len(lower)
        return self._split_until_uncorrelated(
            lower, deps[..., :cut], threshold
        ) + self._split_until_uncorrelated(upper, deps[..., cut:], threshold)

    @staticmethod
    def _is_correlated(error, deps, threshold):
        """Return True if *error* correlates with any row of *deps* above *threshold*."""
        if len(error) < 4:
            return False
        # Constant inputs give NaN correlations; NaN compares False below, so
        # such a bin is treated as uncorrelated.
        with np.errstate(divide="ignore", invalid="ignore"):
            matrix = np.atleast_2d(np.corrcoef(error, deps))
        # Row 0 is the error; columns 1.. are the dependency arrays.
        return bool(np.any(np.abs(matrix[0, 1:]) > threshold))

    def bisect(self, error):
        """Split *error* at its midpoint.

        Parameters
        ----------
        error : array_like
            Sequence to split.

        Returns
        -------
        list
            ``[first_half, second_half]``; for an odd length the second half
            holds the extra element.
        """
        midpoint = len(error) // 2
        return [error[:midpoint], error[midpoint:]]