# Source code for easydecon.extra

from .easydecon import *


# [docs]
def easydecon_workflow(
    sdata,
    markers_df,
    marker_genes=None,                    # This can be a list of genes, You can only give markers_df
    # --- shared / data schema ---
    celltype: str = "group",              # column in markers_df holding cluster IDs
    gene_id_column: str = "names",        # column in markers_df holding gene names
    exclude_group_names: list[str] | None = None,
    bin_size: int = 8,                    # used by both phases and assignment
    # === Phase 1 (priors): common_markers_gene_expression_and_filter ===
    aggregation_method: str = "sum",      # {"sum","mean","median"} supported by your helper funcs
    filtering_algorithm: str = "permutation",  # {"permutation","quantile"}
    num_permutations: int = 5000,         # number of permutations
    parametric: bool = True,              # parametric or empirical quantile
    alpha: float = 0.01,                  # permutation cutoff level
    subsample_size: int = 25000,          # subsample size for permutation
    subsample_signal_quantile: float = 0.1,   # permutation param, between 0 and 1, if 0.1, 10% of the bins with the lowest and highest expression will be discarded
    permutation_gene_pool_fraction: float = 0.3,  # top fraction of genes to be used for the null distribution
    n_subs: int = 5,                      # permutation: number of subsamples
    quantile: float = 0.7,                # used only if filtering_algorithm="quantile"
    # === Phase 2 (evidence): get_clusters_by_similarity_on_tissue ===
    method: str = "wjaccard",             # {"wjaccard","cosine","spearman","euclidean","jaccard","overlap", ...}
    similarity_by_column: str = "logfoldchanges",  # column in markers_df used for similarity
    lambda_param: float = 0.25,           # lambda parameter wjaccard
    weight_column: str = "logfoldchanges",  # column in markers_df for weights etc.
    # === Proportion estimate: get_proportions_on_tissue ===
    proportion_method: str = "nnls",     # 'nnls', 'ridge', 'elastic'
    normalization_method: str = "unit",  # Options: 'unit', 'zscore',"l1"
    regularization_alpha: float = 0.01,                  # regularization alpha
    l1_ratio: float = 0.7,                # L1/L2 ratio for L1+L2 regularization
    # === Evidence→likelihood mapping (lightweight, non-DL) ===
    evidence_to_likelihood: str = "softmax",  # {"row_normalize","softmax"}
    softmax_tau: float = 1.0,             # softmax temperature
    epsilon: float = 1e-12,               # numerical guard
    # === Final assignment: assign_clusters_from_df ===
    results_column: str = "easydecon",
    assign_method: str = "max",           # {"max","hybrid","zmax"} per your implementation
    allow_multiple: bool = False,
    diagnostic=None,
    fold_change_threshold: float = 2.0,
):
    """Run the two-phase easydecon pipeline and assign a label to each bin.

    The steps, in order:

    1. Phase 1 calls ``common_markers_gene_expression_and_filter``; its
       output is clipped at zero and row-normalised into priors.
    2. Phase 2 calls ``get_clusters_by_similarity_on_tissue``; its output is
       turned into row-wise likelihoods with ``evidence_to_likelihood``.
    3. When ``marker_genes`` is not a list, priors and likelihoods are
       multiplied on their shared rows/columns and renormalised into a
       posterior. Rows whose product sums to at most ``epsilon`` fall back
       to the prior. When ``marker_genes`` is a list, no posterior is built
       and the raw phase 2 result is used instead.
    4. ``assign_clusters_from_df`` is called on the posterior (or on the
       phase 2 result).
    5. ``get_proportions_on_tissue`` is attempted; a failure is reported on
       stdout and yields ``None`` instead of aborting the workflow.

    Parameters
    ----------
    sdata
        Spatial data object handed unchanged to every helper.
    markers_df
        Marker table; also used as ``marker_genes`` for phase 1 when
        ``marker_genes`` is ``None``.
    marker_genes
        Optional explicit marker input for phase 1. If it is a ``list``,
        phase 2 and the proportion step receive
        ``common_group_name="MarkerGroup"`` and the posterior step is skipped.
    celltype, gene_id_column, exclude_group_names, bin_size
        Schema/shared options forwarded to the helpers (see inline comments
        in the signature).
    aggregation_method, filtering_algorithm, num_permutations, parametric,
    alpha, subsample_size, subsample_signal_quantile,
    permutation_gene_pool_fraction, n_subs, quantile
        Forwarded to phase 1.
    method, similarity_by_column, lambda_param, weight_column
        Forwarded to phase 2 (``similarity_by_column`` also to the
        proportion step).
    proportion_method, normalization_method, regularization_alpha, l1_ratio
        Forwarded to ``get_proportions_on_tissue`` as ``method``,
        ``normalization_method``, ``alpha`` and ``l1_ratio``.
    evidence_to_likelihood
        ``"softmax"`` or ``"row_normalize"``.
    softmax_tau
        Softmax temperature; values below ``epsilon`` are replaced by
        ``epsilon``.
    epsilon
        Numerical guard for the temperature and the zero-posterior test.
    results_column, assign_method, allow_multiple, diagnostic,
    fold_change_threshold
        Forwarded to ``assign_clusters_from_df``.

    Returns
    -------
    tuple
        ``(phase1_result, phase2_result, assigned_labels, scores,
        proportions_df)`` where ``scores`` is the posterior DataFrame, or
        ``phase2_result`` when no posterior was computed, and
        ``proportions_df`` is ``None`` if the proportion step failed.

    Raises
    ------
    TypeError
        If phase 1 or phase 2 does not return a pandas DataFrame.
    ValueError
        If ``evidence_to_likelihood`` is unknown, or priors and likelihoods
        share no columns or no rows.
    """
    marker_list_mode = isinstance(marker_genes, list)
    common_group_name = "MarkerGroup" if marker_list_mode else None

    # -----------------------
    # Phase 1: Priors
    # -----------------------
    phase1_result = common_markers_gene_expression_and_filter(
        sdata=sdata,
        marker_genes=markers_df if marker_genes is None else marker_genes,
        celltype=celltype,
        gene_id_column=gene_id_column,
        exclude_group_names=exclude_group_names,
        bin_size=bin_size,
        aggregation_method=aggregation_method,
        add_to_obs=marker_genes is not None,
        filtering_algorithm=filtering_algorithm,
        num_permutations=num_permutations,
        alpha=alpha,
        subsample_size=subsample_size,
        subsample_signal_quantile=subsample_signal_quantile,
        permutation_gene_pool_fraction=permutation_gene_pool_fraction,
        n_subs=n_subs,
        quantile=quantile,
        parametric=parametric,
    )

    if not isinstance(phase1_result, pd.DataFrame):
        raise TypeError("Phase 1 result must be a pandas DataFrame (spots x clusters).")

    # Non-negative, row-stochastic priors; all-zero rows stay all-zero.
    priors_df = phase1_result.clip(lower=0)
    priors_row_sum = priors_df.sum(axis=1).replace(0, np.nan)
    priors_df = priors_df.div(priors_row_sum, axis=0).fillna(0)

    # -----------------------
    # Phase 2: Evidence
    # -----------------------
    phase2_result = get_clusters_by_similarity_on_tissue(
        sdata=sdata,
        markers_df=markers_df,
        bin_size=bin_size,
        gene_id_column=gene_id_column,
        method=method,
        add_to_obs=False,
        common_group_name=common_group_name,
        similarity_by_column=similarity_by_column,
        weight_column=weight_column,
        lambda_param=lambda_param,
    )
    if not isinstance(phase2_result, pd.DataFrame):
        raise TypeError("Phase 2 result must be a pandas DataFrame (spots x clusters).")

    evidence_df = phase2_result.copy()
    if evidence_to_likelihood == "row_normalize":
        # Shift only the rows that actually contain negative scores, so one
        # row's likelihoods never depend on the values of unrelated rows.
        min_per_row = evidence_df.min(axis=1)
        shift = min_per_row.where(min_per_row < 0, 0)
        evidence_df = evidence_df.sub(shift, axis=0).clip(lower=0)
        evidence_row_sum = evidence_df.sum(axis=1).replace(0, np.nan)
        likelihoods_df = evidence_df.div(evidence_row_sum, axis=0).fillna(0)

    elif evidence_to_likelihood == "softmax":
        x = evidence_df.to_numpy(dtype=float)
        # Subtract the row maximum before exponentiating to avoid overflow.
        row_max = np.nanmax(x, axis=1, keepdims=True)
        logits = (x - row_max) / max(softmax_tau, epsilon)
        np.exp(logits, out=logits)
        row_sum = np.sum(logits, axis=1, keepdims=True)
        row_sum[row_sum == 0] = np.nan
        # Rows containing NaN end up as all zeros after nan_to_num.
        likelihoods_np = np.nan_to_num(logits / row_sum, nan=0.0)
        likelihoods_df = pd.DataFrame(
            likelihoods_np, index=evidence_df.index, columns=evidence_df.columns
        )

    else:
        raise ValueError("evidence_to_likelihood must be one of {'row_normalize','softmax'}.")

    # -----------------------
    # Posterior combination
    # -----------------------
    if not marker_list_mode:
        common_clusters = priors_df.columns.intersection(likelihoods_df.columns)
        if len(common_clusters) == 0:
            raise ValueError("No overlapping cluster columns between Phase 1 and Phase 2 outputs.")
        priors_aligned = priors_df[common_clusters]
        likelihoods_aligned = likelihoods_df[common_clusters]

        common_spots = priors_aligned.index.intersection(likelihoods_aligned.index)
        if len(common_spots) == 0:
            raise ValueError("No overlapping spot/bin indices between Phase 1 and Phase 2 outputs.")
        priors_aligned = priors_aligned.loc[common_spots]
        likelihoods_aligned = likelihoods_aligned.loc[common_spots]

        posterior_unnorm = priors_aligned * likelihoods_aligned
        # Where prior and likelihood do not overlap at all, keep the prior.
        zero_rows = posterior_unnorm.sum(axis=1) <= epsilon
        if zero_rows.any():
            posterior_unnorm.loc[zero_rows] = priors_aligned.loc[zero_rows]

        posterior_row_sum = posterior_unnorm.sum(axis=1).replace(0, np.nan)
        posterior_df = posterior_unnorm.div(posterior_row_sum, axis=0).fillna(0)
    else:
        print("Regular workflow, phase 1 used to find most likely postions and phase 2 to assign labels")
        posterior_df = None

    # Scores used for the assignment and returned to the caller.
    final_scores = phase2_result if posterior_df is None else posterior_df

    # -----------------------
    # Final assignment
    # -----------------------
    assigned_labels = assign_clusters_from_df(
        sdata,
        df=final_scores,
        bin_size=bin_size,
        results_column=results_column,
        method=assign_method,
        allow_multiple=allow_multiple,
        diagnostic=diagnostic,
        fold_change_threshold=fold_change_threshold,
    )

    # Proportion estimation is best-effort: report the failure and continue.
    try:
        proportions_df = get_proportions_on_tissue(
            sdata,
            markers_df=markers_df,
            bin_size=bin_size,
            add_to_obs=False,
            gene_id_column=gene_id_column,
            common_group_name=common_group_name,
            similarity_by_column=similarity_by_column,
            method=proportion_method,
            normalization_method=normalization_method,
            alpha=regularization_alpha,
            l1_ratio=l1_ratio,
            verbose=True,
        )
    except Exception as exc:  # not a bare except: Ctrl-C must still propagate
        print("Proportions could not be estimated, please check if similarity_by_column exists in the data frame...")
        print(f"Reason: {type(exc).__name__}: {exc}")
        proportions_df = None

    print("Finished!")
    print("Posterior df and proportions can be None if the required columns or input parameters missing...")
    return phase1_result, phase2_result, assigned_labels, final_scores, proportions_df





# [docs]
def get_clusters_expression_on_tissue(sdata,markers_df,common_group_name=None,
                                      bin_size=8,gene_id_column="names",aggregation_method="mean",add_to_obs=True):
    """Aggregate marker-gene expression per cluster for every spot.

    Parameters
    ----------
    sdata
        Either an object exposing ``.tables[...]`` or, as a fallback, the
        table itself (it must provide ``var_names``, ``obs`` and
        ``table[spot, genes].to_df()``).
    markers_df
        Marker table whose index holds the cluster labels and whose
        ``gene_id_column`` holds gene names. Genes absent from
        ``table.var_names`` are ignored.
    common_group_name
        Optional column of ``table.obs``. When present, only spots whose
        value is non-zero are processed; all other spots keep 0.
    bin_size
        Bin size used to build the table key (``square_008um`` for 8).
    gene_id_column
        Column of ``markers_df`` containing gene names.
    aggregation_method
        ``"mean"``, ``"median"`` or ``"sum"`` across a cluster's genes.
    add_to_obs
        If true, replace/add the cluster columns in ``table.obs``.

    Returns
    -------
    pandas.DataFrame
        Spots x clusters frame of aggregated expression (float, 0 for
        spots that were not processed).

    Raises
    ------
    ValueError
        If ``aggregation_method`` is not one of the supported names.
    """
    if aggregation_method not in ("mean", "median", "sum"):
        raise ValueError("aggregation_method must be one of {'mean','median','sum'}.")

    # Try the zero-padded key first (square_016um for 16), then the legacy
    # "square_00<bin>um" spelling; fall back to treating sdata as the table.
    table_keys = []
    try:
        table_keys.append(f"square_{int(bin_size):03d}um")
    except (TypeError, ValueError):
        pass
    legacy_key = f"square_00{bin_size}um"
    if legacy_key not in table_keys:
        table_keys.append(legacy_key)

    table = sdata
    for key in table_keys:
        try:
            table = sdata.tables[key]
        except (AttributeError, KeyError):
            continue
        break

    markers_df_tmp=markers_df[markers_df[gene_id_column].isin(table.var_names)] #just to be sure the genes are present in the spatial data

    if common_group_name in table.obs.columns:
        print(f"Processing spots with {common_group_name} != 0")
        spots_with_expression = table.obs[table.obs[common_group_name] != 0].index
    else:
        print("common_group_name column not found in the table, processing all spots.")
        spots_with_expression = table.obs.index

    def aggregate(frame):
        # Use the DataFrame reduction directly: np.median(frame) would return
        # a plain ndarray, unlike np.mean/np.sum which defer to pandas.
        return getattr(frame, aggregation_method)(axis=1).to_numpy()

    # Preallocate as float so row assignment never has to upcast int columns.
    all_spots = table.obs.index
    all_clusters = markers_df_tmp.index.unique()
    df = pd.DataFrame(0.0, index=all_spots, columns=all_clusters)

    # Kept from the original implementation (registers tqdm's pandas hooks).
    tqdm.pandas()

    # The gene list of each cluster does not depend on the spot: build it once.
    genes_per_cluster = [
        markers_df_tmp.loc[[cluster], gene_id_column].to_numpy()
        for cluster in all_clusters
    ]

    # Process only spots with expression (nothing to do without clusters).
    if len(all_clusters) > 0:
        for spot in tqdm(spots_with_expression, desc='Processing spots',leave=True, position=0):
            # One column per cluster, one row per matched observation.
            df.loc[spot] = np.column_stack(
                [aggregate(table[spot, genes].to_df()) for genes in genes_per_cluster]
            )

    if add_to_obs:
        print("Adding results to table.obs of sdata object")
        # Drop stale cluster columns first so the merge does not create suffixes.
        table.obs.drop(columns=all_clusters,inplace=True,errors='ignore')
        table.obs=pd.merge(table.obs, df, left_index=True, right_index=True)

    return df