Analyzing Space Density and Cluster Membership in Astrophysical Data

ActionSpaceDensity.ipynb

This Jupyter notebook contains a comprehensive workflow for analyzing space density and determining cluster membership in astrophysical data, primarily focusing on the properties and movements within galaxies. The notebook starts by importing necessary libraries for data manipulation, statistical analysis, and visualization, including pandas, matplotlib, scipy, and scikit-image. It then defines several functions for edge detection, normalization, and filters such as Sobel and Frangi for image processing to assist in identifying clusters. Using a combination of KDE (Kernel Density Estimation), DBSCAN, and HDBSCAN for clustering, the notebook meticulously processes astronomical data to identify boundary regions of clusters and assigns cluster memberships to individual data points. Lastly, it visualizes the clusters and saves the clustered data into a CSV file, highlighting the use of computational techniques in understanding complex astrophysical phenomena.

snippet.python
from fidanka.fiducial.fiducial import hull_density
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from tqdm.notebook import tqdm
from scipy import signal
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.ndimage import minimum_filter, maximum_filter, sobel
from skimage.util import invert
from itertools import combinations
from dtaidistance import dtw
from astroML.correlation import two_point
from scipy.stats import gaussian_kde
import pickle as pkl
from mplEasyAnimate import animation
from skimage.filters import threshold_otsu
from skimage import data, io, filters
from skimage.feature import hessian_matrix, hessian_matrix_eigvals
from sklearn.cluster import DBSCAN
from sklearn.cluster import HDBSCAN
from skimage.filters import frangi
from scipy.spatial import ConvexHull, QhullError
from scipy.ndimage import binary_erosion
from shapely.geometry import Polygon, Point
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d
from scipy.spatial import distance_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csr_matrix
plt.style.use(pubStyle)
snippet.python
def find_edge_points(cluster_points, grid_shape):
    """
    Identify the edge points of a cluster on a grid.
 
    Parameters:
    cluster_points (list of tuples): List of (x, y) tuples representing points in the cluster.
    grid_shape (tuple): The shape of the grid (height, width).
 
    Returns:
    list: A list of (x, y) tuples representing the edge points of the cluster.
    """
    grid = np.zeros(grid_shape, dtype=bool)
    for x, y in cluster_points:
        grid[y, x] = True  # Mark the point as part of the cluster
 
    eroded_grid = binary_erosion(grid)
 
    # Edge points are original grid points minus eroded points
    edge_grid = grid & ~eroded_grid
    edge_points = np.argwhere(edge_grid)
 
    return np.array([(x, y) for y, x in edge_points])
snippet.python
def create_mapping_function(input_range, output_range):
    """
    Create a function to map a value from an input range to an output range.
 
    Parameters:
    input_range (tuple): The range of input values (min, max).
    output_range (tuple): The range of output values (min, max).
 
    Returns:
    function: A function that converts values from the input range to the output range.
    """
    input_min, input_max = input_range
    output_min, output_max = output_range
 
    scale = (output_max - output_min) / (input_max - input_min)
    offset = output_min - input_min * scale
 
    def mapping_function(x):
        return x * scale + offset
 
    return mapping_function
snippet.python
def apply_frangi_filter(image):
    processed_image = frangi(image, scale_range=(1, 3), scale_step=1, beta=0.3, gamma=0.25)
    return processed_image
snippet.python
def apply_sobel_edge_detection(image):
    dx = sobel(image, axis=0, mode='reflect')
    dy = sobel(image, axis=1, mode='reflect')
 
    magnitude = np.sqrt(dx**2 + dy**2)
 
    return magnitude
snippet.python
def local_normalize(image, size=3):
    footprint = np.ones((size, size))
 
    local_min = minimum_filter(image, footprint=footprint, mode='reflect')
    local_max = maximum_filter(image, footprint=footprint, mode='reflect')
 
    denom = local_max - local_min
    denom[denom == 0] = 1  # Prevent division by zero
    normalized_image = (image - local_min) / denom
 
    return normalized_image
snippet.python
def boolean_to_scatter(bool_array, x_array, y_array):
    y_indices, x_indices = np.where(bool_array)
 
    scatter_x = x_array[x_indices]
    scatter_y = y_array[y_indices]
 
    return scatter_x, scatter_y
snippet.python
class BoundaryRegion:
    def __init__(self, points, xRange = lambda x: x, yRange = lambda y: y):
        self.rawPoints = np.array(points) # must be of shape mxn where m is the number of points and n is the number of dimensions
        self.affinePoints = np.array([[xRange(x), yRange(y)] for x, y in zip(points[:, 0], points[:, 1])]) 
        self.sort_points()
        self.polygon = Polygon(self.points)
        self.tck = None
        self.u = None
 
    def sort_points(self):
        """ Sort points to form a valid polygon using nearest neighbor heuristic. """
        if len(self.rawPoints) < 3:
            return  # Not enough points to form a polygon
 
        sorted_points = [self.rawPoints[0]]
 
        # Iterate to find the nearest unvisited point
        current_point = sorted_points[0]
        visitedIndexs = [0]
        while True:
            dx = (self.rawPoints[:, 0] - current_point[0])
            dy = (self.rawPoints[:, 1] - current_point[1])
            distances = np.sqrt(dx**2 + dy**2)
            distances[visitedIndexs] = np.inf
            nearest_index = np.argmin(distances)
            if nearest_index not in visitedIndexs:
                visitedIndexs.append(nearest_index)
                current_point = self.rawPoints[nearest_index]
                sorted_points.append(current_point)
            else:
                break
 
        self.rawPoints = np.array(sorted_points)
        self.affinePoints = self.affinePoints[visitedIndexs]
 
    @property
    def points(self):
        return self.affinePoints
 
    def is_point_inside(self, test_point):
        """ Check if a point is inside the polygon defined by the boundary points using shapely. """
        point = Point(test_point)
        return self.polygon.contains(point)
 
    def parameterize_boundary(self):
        """ Parameterize the boundary using spline interpolation. """
        t = np.linspace(0, 1, len(self.points))
        x, y = self.points[:, 0], self.points[:, 1]
        self.tck, self.u = interp1d(t, x, kind='cubic'), interp1d(t, y, kind='cubic')
        return self.tck, self.u
 
    def get_point_on_boundary(self, t):
        """ Get a point on the parameterized boundary. """
        if self.tck is None or self.u is None:
            self.parameterize_boundary()
        x = self.tck(t)
        y = self.u(t)
        return x, y
 
    def plot_region(self, ax):
        """ Plot the region on a given matplotlib axis. """
        from matplotlib.patches import Polygon as MplPolygon
        polygon = MplPolygon(self.points, closed=True, edgecolor='blue', facecolor='none')
        ax.add_patch(polygon)
snippet.python
df = pd.read_csv("./params_actions_7kms.csv")
snippet.python
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
x = df.lz_ref
y = df.sqrtjR_ref
cond = (x > 500) & (y > 6)
x = x[cond]
y = y[cond]
ax.scatter(x, y, s=1, alpha=0.1)
<matplotlib.collections.PathCollection at 0x7f4bca5a91c0>

png

snippet.python
n = 100
if not os.path.exists(f"./density.{n}.pkl"):
    fdensity = hull_density(x, y, n=n)
else:
    with open(f"./density.{n}.pkl", 'rb') as f:
        fdensity = pkl.load(f)
snippet.python
# Perform KDE
xMin, xMax = x.min(), x.max()
yMin, yMax = y.min(), y.max()
print(xMin, xMax, yMin, yMax)
kde = gaussian_kde([x, y], bw_method='silverman')
xGrid, yGrid = np.meshgrid(np.linspace(xMin, xMax, 100), np.linspace(yMin, yMax, 100))
density = kde(np.vstack([xGrid.ravel(), yGrid.ravel()])).reshape(xGrid.shape)
# Plotting
plt.figure(figsize=(10, 8))
plt.imshow(np.sqrt(density), origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax])
plt.colorbar(label='Density')
plt.show()
511.689560549884 2301.47274807968 6.00002269148323 14.141994215844566

png

snippet.python
s = 10
normDensity  = local_normalize(density, size=s)
densityTarget = invert(normDensity)**2
 
vesselnessImage = apply_frangi_filter(densityTarget)
tVesselnessImage = np.ones_like(vesselnessImage)
tVesselnessImage[vesselnessImage < 0.03] = 0
 
fig, axs = plt.subplots(1,2,figsize=(20, 7))
densityCMAP = axs[0].imshow(vesselnessImage, origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax])
axs[1].imshow(tVesselnessImage, origin='lower', aspect='auto', extent=[xMin, xMax, yMin, yMax])
plt.colorbar(densityCMAP)
/tmp/ipykernel_51046/4230504841.py:2: UserWarning: Use keyword parameter `sigmas` instead of `scale_range` and `scale_range` which will be removed in version 0.17.
  processed_image = frangi(image, scale_range=(1, 3), scale_step=1, beta=0.3, gamma=0.25)





<matplotlib.colorbar.Colorbar at 0x7f4c099bb730>

png

snippet.python
xThresh, yThresh = boolean_to_scatter(tVesselnessImage, np.arange(0, 101, 1, dtype=int), np.arange(0, 101, 1, dtype=int))
 
coordinates = np.column_stack((xThresh, yThresh))
xRange = create_mapping_function((0, 100), (xMin, xMax))
yRange = create_mapping_function((0, 100), (yMin, yMax))
grid = np.arange(0, 101, 1, dtype=int)
 
dbscan = HDBSCAN()  
clusters = dbscan.fit_predict(coordinates)
 
boundaries = list()
for cluster in set(clusters):
    if cluster == -1:
        continue
    clusterX = coordinates[:, 0][clusters == cluster]
    clusterY = coordinates[:, 1][clusters == cluster]
    clusterPoints = np.column_stack((clusterX, clusterY))
    if len(clusterPoints > 5):
        try:
            vertices = find_edge_points(np.vstack((clusterX, clusterY)).T, (100, 100))
            xVert = grid[vertices[:, 0]]
            yVert = grid[vertices[:, 1]]
            bPoints = np.vstack((xVert, yVert)).T
            boundaries.append(BoundaryRegion(bPoints, xRange=xRange, yRange=yRange))
        except QhullError:
            print(f"Skipping Cluster {cluster} due to QHull Error (such as geometry degeneracy)")
# fig.savefig("WrinkDetectBinCollapse.png")
snippet.python
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
ax.scatter(x, y, s=1)
for region in boundaries:
    region.plot_region(ax)

png

snippet.python
clusterMembers = {}
for clusterID, region in tqdm(enumerate(boundaries), total=len(boundaries)):
    for point in tqdm(np.vstack([x, y]).T):
        if region.is_point_inside(point):
            if clusterID not in clusterMembers:
                clusterMembers[clusterID] = list()
            clusterMembers[clusterID].append(point)      
  0%|          | 0/20 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]



  0%|          | 0/227933 [00:00<?, ?it/s]
snippet.python
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
ax.scatter(x, y, s=1)
for clusterID, members in clusterMembers.items():
    mem = np.array(members)
    print(mem.shape)
    ax.scatter(mem[:, 0], mem[:, 1], color=f"C{clusterID}")
(2, 2)
(2, 2)
(1, 2)
(3, 2)
(5, 2)
(55772, 2)
(322, 2)
(596, 2)
(33, 2)
(2, 2)
(7, 2)
(20, 2)
(272, 2)
(13170, 2)
(13214, 2)
(18, 2)
(427, 2)
(9624, 2)

png

df

x = df.lz_ref

y = df.sqrtjR_ref

snippet.python
df['cluster'] = -1
 
for clusterIndex, points in tqdm(clusterMembers.items(), total=len(boundaries)):
    for point in tqdm(points, leave=False):
        lzRef, sqrtjRRef = point
        # Find the index of the point in the DataFrame
        mask = (df['lz_ref'] == lzRef) & (df['sqrtjR_ref'] == sqrtjRRef)
        df.loc[mask, 'cluster'] = clusterIndex
  0%|          | 0/20 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/55772 [00:00<?, ?it/s]



  0%|          | 0/322 [00:00<?, ?it/s]



  0%|          | 0/596 [00:00<?, ?it/s]



  0%|          | 0/33 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/20 [00:00<?, ?it/s]



  0%|          | 0/272 [00:00<?, ?it/s]



  0%|          | 0/13170 [00:00<?, ?it/s]



  0%|          | 0/13214 [00:00<?, ?it/s]



  0%|          | 0/18 [00:00<?, ?it/s]



  0%|          | 0/427 [00:00<?, ?it/s]



  0%|          | 0/9624 [00:00<?, ?it/s]
snippet.python
df.to_csv("params_actions_7kms_clustered.csv", index=False)

python