This Jupyter notebook contains a comprehensive workflow for analyzing space density and determining cluster membership in astrophysical data, primarily focusing on the properties and movements within galaxies. The notebook starts by importing necessary libraries for data manipulation, statistical analysis, and visualization, including pandas, matplotlib, scipy, and scikit-image. It then defines several functions for edge detection, normalization, and filters such as Sobel and Frangi for image processing to assist in identifying clusters. Using a combination of KDE (Kernel Density Estimation), DBSCAN, and HDBSCAN for clustering, the notebook meticulously processes astronomical data to identify boundary regions of clusters and assigns cluster memberships to individual data points. Lastly, it visualizes the clusters and saves the clustered data into a CSV file, highlighting the use of computational techniques in understanding complex astrophysical phenomena.
from fidanka.fiducial.fiducial import hull_density import pandas as pd import matplotlib.pyplot as plt import os import numpy as np from tqdm.notebook import tqdm from scipy import signal from scipy.spatial.distance import pdist, squareform from scipy.cluster.hierarchy import linkage, dendrogram, fcluster from scipy.ndimage import minimum_filter, maximum_filter, sobel from skimage.util import invert from itertools import combinations from dtaidistance import dtw from astroML.correlation import two_point from scipy.stats import gaussian_kde import pickle as pkl from mplEasyAnimate import animation from skimage.filters import threshold_otsu from skimage import data, io, filters from skimage.feature import hessian_matrix, hessian_matrix_eigvals from sklearn.cluster import DBSCAN from sklearn.cluster import HDBSCAN from skimage.filters import frangi from scipy.spatial import ConvexHull, QhullError from scipy.ndimage import binary_erosion from shapely.geometry import Polygon, Point from scipy.interpolate import interp1d from scipy.ndimage import gaussian_filter1d from scipy.spatial import distance_matrix from scipy.sparse.csgraph import minimum_spanning_tree from scipy.sparse import csr_matrix plt.style.use(pubStyle)
def find_edge_points(cluster_points, grid_shape): """ Identify the edge points of a cluster on a grid. Parameters: cluster_points (list of tuples): List of (x, y) tuples representing points in the cluster. grid_shape (tuple): The shape of the grid (height, width). Returns: list: A list of (x, y) tuples representing the edge points of the cluster. """ grid = np.zeros(grid_shape, dtype=bool) for x, y in cluster_points: grid[y, x] = True # Mark the point as part of the cluster eroded_grid = binary_erosion(grid) # Edge points are original grid points minus eroded points edge_grid = grid & ~eroded_grid edge_points = np.argwhere(edge_grid) return np.array([(x, y) for y, x in edge_points])
def create_mapping_function(input_range, output_range): """ Create a function to map a value from an input range to an output range. Parameters: input_range (tuple): The range of input values (min, max). output_range (tuple): The range of output values (min, max). Returns: function: A function that converts values from the input range to the output range. """ input_min, input_max = input_range output_min, output_max = output_range scale = (output_max - output_min) / (input_max - input_min) offset = output_min - input_min * scale def mapping_function(x): return x * scale + offset return mapping_function
def apply_frangi_filter(image): processed_image = frangi(image, scale_range=(1, 3), scale_step=1, beta=0.3, gamma=0.25) return processed_image
def apply_sobel_edge_detection(image): dx = sobel(image, axis=0, mode='reflect') dy = sobel(image, axis=1, mode='reflect') magnitude = np.sqrt(dx**2 + dy**2) return magnitude
def local_normalize(image, size=3): footprint = np.ones((size, size)) local_min = minimum_filter(image, footprint=footprint, mode='reflect') local_max = maximum_filter(image, footprint=footprint, mode='reflect') denom = local_max - local_min denom[denom == 0] = 1 # Prevent division by zero normalized_image = (image - local_min) / denom return normalized_image
def boolean_to_scatter(bool_array, x_array, y_array): y_indices, x_indices = np.where(bool_array) scatter_x = x_array[x_indices] scatter_y = y_array[y_indices] return scatter_x, scatter_y
class BoundaryRegion: def __init__(self, points, xRange = lambda x: x, yRange = lambda y: y): self.rawPoints = np.array(points) # must be of shape mxn where m is the number of points and n is the number of dimensions self.affinePoints = np.array([[xRange(x), yRange(y)] for x, y in zip(points[:, 0], points[:, 1])]) self.sort_points() self.polygon = Polygon(self.points) self.tck = None self.u = None def sort_points(self): """ Sort points to form a valid polygon using nearest neighbor heuristic. """ if len(self.rawPoints) < 3: return # Not enough points to form a polygon sorted_points = [self.rawPoints[0]] # Iterate to find the nearest unvisited point current_point = sorted_points[0] visitedIndexs = [0] while True: dx = (self.rawPoints[:, 0] - current_point[0]) dy = (self.rawPoints[:, 1] - current_point[1]) distances = np.sqrt(dx**2 + dy**2) distances[visitedIndexs] = np.inf nearest_index = np.argmin(distances) if nearest_index not in visitedIndexs: visitedIndexs.append(nearest_index) current_point = self.rawPoints[nearest_index] sorted_points.append(current_point) else: break self.rawPoints = np.array(sorted_points) self.affinePoints = self.affinePoints[visitedIndexs] @property def points(self): return self.affinePoints def is_point_inside(self, test_point): """ Check if a point is inside the polygon defined by the boundary points using shapely. """ point = Point(test_point) return self.polygon.contains(point) def parameterize_boundary(self): """ Parameterize the boundary using spline interpolation. """ t = np.linspace(0, 1, len(self.points)) x, y = self.points[:, 0], self.points[:, 1] self.tck, self.u = interp1d(t, x, kind='cubic'), interp1d(t, y, kind='cubic') return self.tck, self.u def get_point_on_boundary(self, t): """ Get a point on the parameterized boundary. """ if self.tck is None or self.u is None: self.parameterize_boundary() x = self.tck(t) y = self.u(t) return x, y def plot_region(self, ax): """ Plot the region on a given matplotlib axis. """ from matplotlib.patches import Polygon as MplPolygon polygon = MplPolygon(self.points, closed=True, edgecolor='blue', facecolor='none') ax.add_patch(polygon)
df = pd.read_csv("./params_actions_7kms.csv")
fig, ax = plt.subplots(1, 1, figsize=(10, 7)) x = df.lz_ref y = df.sqrtjR_ref cond = (x > 500) & (y > 6) x = x[cond] y = y[cond] ax.scatter(x, y, s=1, alpha=0.1)
<matplotlib.collections.PathCollection at 0x7f4bca5a91c0>
n = 100 if not os.path.exists(f"./density.{n}.pkl"): fdensity = hull_density(x, y, n=n) else: with open(f"./density.{n}.pkl", 'rb') as f: fdensity = pkl.load(f)
# Perform KDE xMin, xMax = x.min(), x.max() yMin, yMax = y.min(), y.max() print(xMin, xMax, yMin, yMax) kde = gaussian_kde([x, y], bw_method='silverman') xGrid, yGrid = np.meshgrid(np.linspace(xMin, xMax, 100), np.linspace(yMin, yMax, 100)) density = kde(np.vstack([xGrid.ravel(), yGrid.ravel()])).reshape(xGrid.shape) # Plotting plt.figure(figsize=(10, 8)) plt.imshow(np.sqrt(density), origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax]) plt.colorbar(label='Density') plt.show()
511.689560549884 2301.47274807968 6.00002269148323 14.141994215844566
s = 10 normDensity = local_normalize(density, size=s) densityTarget = invert(normDensity)**2 vesselnessImage = apply_frangi_filter(densityTarget) tVesselnessImage = np.ones_like(vesselnessImage) tVesselnessImage[vesselnessImage < 0.03] = 0 fig, axs = plt.subplots(1,2,figsize=(20, 7)) densityCMAP = axs[0].imshow(vesselnessImage, origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax]) axs[1].imshow(tVesselnessImage, origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax]) plt.colorbar(densityCMAP)
/tmp/ipykernel_51046/4230504841.py:2: UserWarning: Use keyword parameter `sigmas` instead of `scale_range` and `scale_range` which will be removed in version 0.17. processed_image = frangi(image, scale_range=(1, 3), scale_step=1, beta=0.3, gamma=0.25) <matplotlib.colorbar.Colorbar at 0x7f4c099bb730>
xThresh, yThresh = boolean_to_scatter(tVesselnessImage, np.arange(0, 101, 1, dtype=int), np.arange(0, 101, 1, dtype=int)) coordinates = np.column_stack((xThresh, yThresh)) xRange = create_mapping_function((0, 100), (xMin, xMax)) yRange = create_mapping_function((0, 100), (yMin, yMax)) grid = np.arange(0, 101, 1, dtype=int) dbscan = HDBSCAN() clusters = dbscan.fit_predict(coordinates) boundaries = list() for cluster in set(clusters): if cluster == -1: continue clusterX = coordinates[:, 0][clusters == cluster] clusterY = coordinates[:, 1][clusters == cluster] clusterPoints = np.column_stack((clusterX, clusterY)) if len(clusterPoints > 5): try: vertices = find_edge_points(np.vstack((clusterX, clusterY)).T, (100, 100)) xVert = grid[vertices[:, 0]] yVert = grid[vertices[:, 1]] bPoints = np.vstack((xVert, yVert)).T boundaries.append(BoundaryRegion(bPoints, xRange=xRange, yRange=yRange)) except QhullError: print(f"Skipping Cluster {cluster} due to QHull Error (such as geometry degeneracy)") # fig.savefig("WrinkDetectBinCollapse.png")
fig, ax = plt.subplots(1, 1, figsize=(10, 7)) ax.scatter(x, y, s=1) for region in boundaries: region.plot_region(ax)
clusterMembers = {} for clusterID, region in tqdm(enumerate(boundaries), total=len(boundaries)): for point in tqdm(np.vstack([x, y]).T): if region.is_point_inside(point): if clusterID not in clusterMembers: clusterMembers[clusterID] = list() clusterMembers[clusterID].append(point)
0%| | 0/20 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s] 0%| | 0/227933 [00:00<?, ?it/s]
fig, ax = plt.subplots(1, 1, figsize=(10, 7)) ax.scatter(x, y, s=1) for clusterID, members in clusterMembers.items(): mem = np.array(members) print(mem.shape) ax.scatter(mem[:, 0], mem[:, 1], color=f"C{clusterID}")
(2, 2) (2, 2) (1, 2) (3, 2) (5, 2) (55772, 2) (322, 2) (596, 2) (33, 2) (2, 2) (7, 2) (20, 2) (272, 2) (13170, 2) (13214, 2) (18, 2) (427, 2) (9624, 2)
df
x = df.lz_ref
y = df.sqrtjR_ref
df['cluster'] = -1 for clusterIndex, points in tqdm(clusterMembers.items(), total=len(boundaries)): for point in tqdm(points, leave=False): lzRef, sqrtjRRef = point # Find the index of the point in the DataFrame mask = (df['lz_ref'] == lzRef) & (df['sqrtjR_ref'] == sqrtjRRef) df.loc[mask, 'cluster'] = clusterIndex
0%| | 0/20 [00:00<?, ?it/s] 0%| | 0/2 [00:00<?, ?it/s] 0%| | 0/2 [00:00<?, ?it/s] 0%| | 0/1 [00:00<?, ?it/s] 0%| | 0/3 [00:00<?, ?it/s] 0%| | 0/5 [00:00<?, ?it/s] 0%| | 0/55772 [00:00<?, ?it/s] 0%| | 0/322 [00:00<?, ?it/s] 0%| | 0/596 [00:00<?, ?it/s] 0%| | 0/33 [00:00<?, ?it/s] 0%| | 0/2 [00:00<?, ?it/s] 0%| | 0/7 [00:00<?, ?it/s] 0%| | 0/20 [00:00<?, ?it/s] 0%| | 0/272 [00:00<?, ?it/s] 0%| | 0/13170 [00:00<?, ?it/s] 0%| | 0/13214 [00:00<?, ?it/s] 0%| | 0/18 [00:00<?, ?it/s] 0%| | 0/427 [00:00<?, ?it/s] 0%| | 0/9624 [00:00<?, ?it/s]
df.to_csv("params_actions_7kms_clustered.csv", index=False)
python