construct-diffusion-map.py

#!/usr/bin/env python3
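"""Construct a diffusion map from ithildin simulation data.

This script clips off simulation borders, subsamples the state variables,
optionally z-scores them, fits a pydiffmap diffusion map, and pickles the
result together with the preprocessing parameters.
"""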
import argparse
import gc
import pickle

import ithildin as ith
import numpy as np
from pydiffmap.diffusion_map import DiffusionMap

parser = argparse.ArgumentParser(description="Compute a diffusion map.",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('stem', help='stem for the ithildin simulation data')
parser.add_argument('diffmap', help='file name to save the diffusion map as')
parser.add_argument('-k', metavar='k', type=int, help='number of nearest neighbours to use', default=64)
parser.add_argument('--eps', metavar='eps', type=float, help='epsilon (kernel bandwidth) for the diffusion map', default=1)
parser.add_argument('--points', type=int, help='number of sampled points to use', default=2048)
parser.add_argument('--normalize', action=argparse.BooleanOptionalAction,
                    help='normalize data before constructing diffusion maps', default=True)
parser.add_argument('--space-clip-percentage', type=int,
                    help='percentage to ignore in each non-small space dimension (for eliminating border effects)',
                    default=25)
parser.add_argument('--small-threshold', type=int,
                    help='number of grid points in a space dimension from which it is considered non-small',
                    default=20)
parser.add_argument('--start-time', type=int, help='time to start at (percentage-wise)', default=10)
parser.add_argument('--end-time', type=int, help='time to end at (percentage-wise)', default=100)
args = parser.parse_args()
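# Example invocation (a sketch; 'data/run1' and 'run1.dmap' are hypothetical
# file names):
#
#   ./construct-diffusion-map.py data/run1 run1.dmap -k 32 --points 1024
#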
print("Loading simulation data from %s ..." % (args.stem,))
data = ith.SimData.from_stem(args.stem)

def apply_percentage(p, x):
    assert 0 <= p
    assert p <= 100
    assert 0 <= x
    # The exact rounding is expected to have only negligible effects.
    return (p * x) // 100

def start_percentage(dimension):
    if dimension == 0:  # time dimension
        return args.start_time
    else:  # space dimension
        return args.space_clip_percentage // 2

def end_percentage(dimension):
    if dimension == 0:  # time dimension
        return args.end_time
    else:  # space dimension
        if data.shape[dimension] < args.small_threshold:
            # Small space dimensions are not clipped (this is what
            # --small-threshold controls); it also avoids producing an
            # empty slice for singleton dimensions.
            return 100
        else:
            # TODO: alternatively, use round-upwards division elsewhere.
            return 100 - (args.space_clip_percentage // 2)

def make_borders(start_percentage, end_percentage):
    shape = list(data.shape)
    for i in range(len(shape)):
        shape[i] = slice(apply_percentage(start_percentage(i), shape[i]),
                         apply_percentage(end_percentage(i), shape[i]))
    return tuple(shape)

def remove_all_borders(variable):
    slices = make_borders(start_percentage, end_percentage)
    return variable[slices]

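# Worked example of the clipping above (hypothetical 100x200x200 grid in
# (t, x, y), default settings): the time axis keeps slice(10, 100) (start
# at 10%, end at 100%), and each non-small space axis loses
# 25% // 2 = 12% on either side, keeping slice(24, 176).
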
print('Preprocessing input data ...')
vars = dict()
# Remove borders from each variable.
for key in data.vars.keys():
    vars[key] = remove_all_borders(data.vars[key])
# Use the phase space, not the configuration space.
original_number_of_points = None
for key in data.vars.keys():
    vars[key] = vars[key].ravel()
    original_number_of_points = vars[key].shape[0]
# Reduce the number of points, for computational reasons.
np.random.seed(1)  # determinism
choices = np.random.choice(original_number_of_points, args.points, replace=False)
for key in data.vars.keys():
    vars[key] = vars[key][choices]
del choices
gc.collect()  # save some memory
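# Note: np.random.choice with replace=False raises a ValueError when
# --points exceeds the number of available points. A possible guard (a
# sketch, not part of the original flow; n_points is a hypothetical name):
#
#   n_points = min(args.points, original_number_of_points)
#   choices = np.random.choice(original_number_of_points, n_points, replace=False)
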
# Normalise data, if requested.
means = dict()
spreads = dict()
for key in data.vars.keys():
    means[key] = np.mean(vars[key])
    spreads[key] = np.std(vars[key])
    # K is constant for some of the models; don't divide by zero.
    if args.normalize and spreads[key] != 0:
        vars[key] -= means[key]
        vars[key] /= spreads[key]
gc.collect()
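# The normalisation above is a per-variable z-score, x = (u - mean) / std.
# Since the means and spreads are pickled below, the original scale can be
# recovered later, e.g. (a sketch, for some hypothetical variable name 'u'):
#
#   u_original = dmap_info['spreads']['u'] * x + dmap_info['means']['u']
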
# Compute the diffusion map.
dmap = DiffusionMap.from_sklearn(epsilon=args.eps, k=args.k, n_evecs=len(vars))
variable_names = sorted(vars.keys())
variable_values = np.column_stack([vars[name] for name in variable_names])
print('Data matrix shape:', variable_values.shape)
# del vars  # try reducing peak memory usage (not very effective?)
# gc.collect()
print('Computing the diffusion map ...')
dmap.fit(variable_values)
print('Computed!')
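# After fitting, the pydiffmap object exposes (among others) the kernel
# eigenvalues (dmap.evals), eigenvectors (dmap.evecs) and the diffusion
# coordinates (dmap.dmap, the eigenvectors scaled by their eigenvalues);
# these are the fields pickled below.
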
# Save the diffusion map.
# print(dir(dmap))  # debug: list all attributes of the fitted map
dmap_info = dict()
dmap_info['variable_names'] = variable_names
dmap_info['preprocessed_data'] = dmap.data
dmap_info['means'] = means
dmap_info['spreads'] = spreads
dmap_info['alpha'] = dmap.alpha
dmap_info['n_evecs'] = dmap.n_evecs
dmap_info['evecs'] = dmap.evecs
dmap_info['evals'] = dmap.evals
dmap_info['right_norm_vec'] = dmap.right_norm_vec
dmap_info['epsilon_fitted'] = dmap.epsilon_fitted
dmap_info['weights'] = dmap.weights
dmap_info['kernel_matrix'] = dmap.kernel_matrix
dmap_info['L'] = dmap.L
dmap_info['q'] = dmap.q
dmap_info['dmap'] = dmap.dmap
dmap_info['normalize'] = args.normalize
dmap_info['start_time'] = args.start_time
dmap_info['end_time'] = args.end_time
dmap_info['space_clip_percentage'] = args.space_clip_percentage
dmap_info['local_kernel'] = dict(k=dmap.local_kernel.k,
                                 neigh=dmap.local_kernel.neigh,
                                 epsilon=dmap.local_kernel.epsilon,
                                 epsilon_fitted=dmap.local_kernel.epsilon_fitted)
with open(args.diffmap, 'wb') as f:
    pickle.dump(dmap_info, f)
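# Loading the result back is a plain pickle read (a sketch; 'run1.dmap' is a
# hypothetical output file name):
#
#   import pickle
#   with open('run1.dmap', 'rb') as f:
#       dmap_info = pickle.load(f)
#   print(dmap_info['evals'])  # kernel eigenvalues of the diffusion map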