Astronomical Data Analysis with Advanced Clustering and Fiducial Points Identification

This Python script is designed for the analysis of astronomical data, focusing on the identification of fiducial points across main sequences and sub-giant branches in stellar populations. It employs various scientific libraries to process and analyze color-magnitude data of stars, using methods such as hierarchical clustering (HDBSCAN), KMeans clustering, curve fitting, and smoothing techniques (Savitzky-Golay filter, B-splines) to identify and refine fiducial sequences. The script is intended for advanced users with a background in astrophysics or astronomy, conducting research that requires precise characterization of stellar populations.

import matplotlib.pyplot as plt
import pickle as pkl
import numpy as np
from scipy.optimize import minimize
from scipy.interpolate import interp1d
from functools import lru_cache, wraps
from scipy.spatial.distance import cdist
import hdbscan
from scipy.optimize import curve_fit
from sklearn.cluster import KMeans
from scipy.signal import savgol_filter
from scipy.interpolate import splrep, BSpline
 
 
def two_point(r0, ri, n=50):
    distance = cdist(r0.reshape(1,2), ri)[0]
    nSmallestDistances = distance[np.argpartition(distance, n)[:n]]
    return np.median(nSmallestDistances)
 
 
 
def iter_cut(color, mag, n=100, m=1.5):
    # presort points based on their angle off the horizontal
    # this is to try to push the distance partitionaing to a better
    # case senario
    theta = np.arctan(mag/color)
    sortedTheta = theta.argsort()
    color = color[sortedTheta]
    mag = mag[sortedTheta]
 
    ri = np.vstack((color, mag)).T
    c = np.array([two_point(x,ri,n=n) for x in ri])
    stdC = np.std(c)
    meanC = np.median(c)
    c2 = cdist(c.reshape(c.shape[0],1), np.array([meanC]).reshape(1,1)).flatten()/stdC
 
    color = color[c2 != 0]
    mag = mag[c2 != 0]
    c2b = c2.copy()
    c2 = c2[c2 != 0]
 
    condition = np.log10(c2) < m*np.median(np.log10(c2))#-1.5
    colorCut = color[condition]
    magCut = mag[condition]
    c2Cut = c2[condition]
    return {
        "color": colorCut,
        "mag": magCut,
        "sigDistCut": c2Cut,
        "corr": c,
        "log10_sigDist": np.log10(c2b)
    }
 
def smooth_sequence(color, mag, s=1):
    sortedMag = mag.argsort()
    sC = color[sortedMag]
    sM = mag[sortedMag]
    tck = splrep(sM, sC, s=s)
    f = BSpline(*tck)
 
    return f(mag), f
 
def bifricated_MS_fiducials(color, mag, binSize=0.1):
    magBins = np.arange(mag.min(), mag.max()+binSize, binSize)
    fiducialPoints = np.empty(shape=(2,magBins.shape[0]-1,2))
    for idx, (bb,bt) in enumerate(zip(magBins[:-1], magBins[1:])):
        cMag = mag[(mag >= bb) & (mag < bt)]
        cCol = color[(mag >= bb) & (mag < bt)]
        points = np.vstack((cCol, cMag)).T
        clusters = KMeans(n_clusters=2,random_state=0,n_init='auto').fit(points)
        LRdet = 0 if np.mean(points[clusters.labels_ == 0][:,0]) < np.mean(points[clusters.labels_ == 1][:,0]) else 1
        group1Points = points[clusters.labels_ == LRdet]
        group2Points = points[clusters.labels_ == int(not LRdet)]
 
        fiducialPoints[0,idx,0] = np.mean(group1Points[:,0])
        fiducialPoints[0,idx,1] = np.mean(group1Points[:,1])
        fiducialPoints[1,idx,0] = np.mean(group2Points[:,0])
        fiducialPoints[1,idx,1] = np.mean(group2Points[:,1])
    return fiducialPoints
 
def sub_MSTO_fiducial(color, mag, MSFiducials, binSize=0.1):
    gaus = lambda x, a, b, c: a*np.exp(-(x-b)**2/(2*c**2))
    belowMSColor = color[mag > MSFiducials[:,:,1].max()]
    belowMSMag = mag[mag > MSFiducials[:,:,1].max()]
 
    magRange = (np.percentile(belowMSMag,5), np.percentile(belowMSMag,95))
    belowMSColor = belowMSColor[(belowMSMag >= magRange[0]) & (belowMSMag < magRange[1])]
    belowMSMag = belowMSMag[(belowMSMag >= magRange[0]) & (belowMSMag < magRange[1])]
 
    subMSMags = np.arange(belowMSMag.min(), belowMSMag.max()+binSize, binSize)
    fiducialPoints = np.empty(shape=(subMSMags.shape[0]-1,2))
    for idx, (bb, bt) in enumerate(zip(subMSMags[:-1], subMSMags[1:])):
        cC = belowMSColor[(belowMSMag >= bb) * (belowMSMag < bt)]
        hist,bins = np.histogram(cC, density=True,bins='fd')
        centers = (bins[1:] + bins[:-1])/2
        try:
            fit, covar = curve_fit(gaus, centers, hist)
            fiducialColor = fit[1]
        except RuntimeError:
            print("FAIL")
            fiducialColor = np.median(cC)
        fiducialPoints[idx, 0] = fiducialColor
        fiducialPoints[idx, 1] = (bb+bt)/2
    return fiducialPoints
 
def bifricated_super_MSTO_fiducial(color, mag, MSFiducials, binSize=0.1):
    aboveMSColor = color[mag < MSFiducials[:,:,1].min()]
    aboveMSMag = mag[mag < MSFiducials[:,:,1].min()]
 
    magRange = (np.percentile(aboveMSMag,5), np.percentile(aboveMSMag,95))
    colorRange = (np.percentile(aboveMSColor,5), np.percentile(aboveMSColor,95))
 
    aboveMSColor = aboveMSColor[(aboveMSMag >= magRange[0]) & (aboveMSMag < magRange[1])]
    aboveMSMag = aboveMSMag[(aboveMSMag >= magRange[0]) & (aboveMSMag < magRange[1])]
 
    aboveMSMag = aboveMSMag[(aboveMSColor >= colorRange[0]) & (aboveMSColor < colorRange[1])]
    aboveMSColor = aboveMSColor[(aboveMSColor >= colorRange[0]) & (aboveMSColor < colorRange[1])]
 
    r = iter_cut(aboveMSColor.values, aboveMSMag.values, n=50,m=0)
    superMSMagBins = np.arange(r["mag"].min(), r["mag"].max()+binSize, binSize)
    fiducialPoints = np.empty(shape=(2,superMSMagBins.shape[0]-1,2))
    for idx, (bb, bt) in enumerate(zip(superMSMagBins[:-1], superMSMagBins[1:])):
        cC = r["color"][(r["mag"] >= bb) & (r["mag"] < bt)]
        cM = r["mag"][(r["mag"] >= bb) & (r["mag"]< bt)]
        colorRange = (np.percentile(cC,5), np.percentile(cC,95))
        cM = cM[(cC >= colorRange[0]) & (cC < colorRange[1])]
        cC = cC[(cC >= colorRange[0]) & (cC < colorRange[1])]
 
        points = np.vstack((cC, cM)).T
        clusters = KMeans(n_clusters=2, random_state=0,n_init='auto').fit(points)
        LRdet = 0 if np.mean(points[clusters.labels_ == 0][:,0]) < np.mean(points[clusters.labels_ == 1][:,0]) else 1
        group1Points = points[clusters.labels_ == LRdet]
        group2Points = points[clusters.labels_ == int(not LRdet)]
 
        fiducialPoints[0,idx,0] = np.mean(group1Points[:,0])
        fiducialPoints[0,idx,1] = np.mean(group1Points[:,1])
        fiducialPoints[1,idx,0] = np.mean(group2Points[:,0])
        fiducialPoints[1,idx,1] = np.mean(group2Points[:,1])
 
    return fiducialPoints
 
def sequence_mid_point(A, B, n=3,s=0.5):
    AltB = True if A[:,1].min() > B[:,1].max() else False
    sortedA = np.sort(A.copy())
    sortedB = np.sort(B.copy())
 
    if AltB:
        Ap = A[:n]
        Bp = B[-n:]
    else:
        Ap = A[-n:]
        Bp = B[:n]
 
    fiducialWindow = np.vstack((Ap,Bp))
    fiducialWindow = fiducialWindow[fiducialWindow[:,0].argsort()]
 
    _, fSmooth = smooth_sequence(fiducialWindow[:,0], fiducialWindow[:,1],s=s)
 
    if AltB:
        A[:n, 0] = fSmooth(A[:n, 1])
        B[-n:, 0] = fSmooth(B[-n:, 1])
    else:
        A[-n:, 0] = fSmooth(A[-n:, 1])
        B[:n, 0] = fSmooth(B[:n, 1])
    return A, B
 
def fiducial_points(color, mag):
    MSTOR = iter_cut(color, mag, m=1.5)
    mainSequences = np.vstack((MSTOR["color"], MSTOR["mag"])).T
 
    MSFiducials = bifricated_MS_fiducials(mainSequences[:,0], mainSequences[:,1])
 
    subMSFiducialsRoot = sub_MSTO_fiducial(color,mag,MSFiducials,binSize=0.1)
    subMSFiducials = np.empty(shape=(2,subMSFiducialsRoot.shape[0],2))
 
    subMSFiducials[0] = subMSFiducialsRoot
    subMSFiducials[1] = subMSFiducialsRoot
 
    superMSFiducials = bifricated_super_MSTO_fiducial(color, mag, MSFiducials)
 
    for sequenceID, (MS, RGB) in enumerate(zip(MSFiducials, superMSFiducials)):
        MSsmooth, RGBsmooth = sequence_mid_point(MS, RGB, n=4,s=1)
        MSFiducials[sequenceID] = MSsmooth
        superMSFiducials[sequenceID] = RGBsmooth
 
    for sequenceID, sequence in enumerate(superMSFiducials):
        superMSSmoothColor, _ = smooth_sequence(sequence[:, 0], sequence[:, 1], s=0.1)
        superMSFiducials[sequenceID, :, 0] = superMSSmoothColor
 
    for sequenceID, (PMS, MS) in enumerate(zip(subMSFiducials, MSFiducials)):
        PMSsmooth, MSsmooth = sequence_mid_point(PMS, MS,n=4,s=1)
        MSFiducials[sequenceID] = MS
        subMSFiducials[sequenceID] = PMS
 
    for sequenceID, sequence in enumerate(MSFiducials):
        MSSmoothColor, _ = smooth_sequence(sequence[:, 0], sequence[:, 1], s=0)
        MSFiducials[sequenceID, :, 0] = MSSmoothColor
 
    tfp = MSFiducials.shape[1] + subMSFiducials.shape[1] + superMSFiducials.shape[1]
    fiducialPoints = np.empty(
        shape=(
            MSFiducials.shape[0],
            tfp,
            2
        )
    )
 
 
    for sequenceID, _ in enumerate(fiducialPoints):
        fiducialPoints[sequenceID] = np.vstack(
            (subMSFiducials[sequenceID],MSFiducials[sequenceID],superMSFiducials[sequenceID])
        )
 
    return fiducialPoints
 
def fiducial_functions(color, mag, s=0):
    fiducialPoints = fiducial_points(color, mag)
    functions = list()
    for sequenceID, sequence in enumerate(fiducialPoints):
        functions.append(interp1d(sequence[:,1], sequence[:,0], bounds_error=False, fill_value="extrapolate"))
    return {i+1: f for i, f in enumerate(functions)}