Source code for easydecon.extra

from .easydecon import *

[docs] def easydecon_workflow( sdata, markers_df, marker_genes=None, # This can be a list of genes, You can only give markers_df # --- shared / data schema --- celltype: str = "group", # column in markers_df holding cluster IDs gene_id_column: str = "names", # column in markers_df holding gene names exclude_group_names: list[str] | None = None, bin_size: int = 8, # used by both phases and assignment # === Phase 1 (priors): common_markers_gene_expression_and_filter === aggregation_method: str = "sum", # {"sum","mean","median"} supported by your helper funcs filtering_algorithm: str = "permutation", # {"permutation","quantile"} num_permutations: int = 5000, # number of permutations parametric: bool = True, # parametric or empirical quantile alpha: float = 0.01, # permutation cutoff level subsample_size: int = 25000, # subsample size for permutation subsample_signal_quantile: float = 0.1, #permutation param, between 0 and 1, if 0.1, 10% of the bins with the lowest and highest expression will be discarded permutation_gene_pool_fraction: float = 0.3, # top fraction of genes to be used for the null distribution n_subs: int = 5, # permutation: number of subsamples quantile: float = 0.7, # used only if filtering_algorithm="quantile" # === Phase 2 (evidence): get_clusters_by_similarity_on_tissue === method: str = "wjaccard", # {"wjaccard","cosine","spearman","euclidean","jaccard","overlap", ...} similarity_by_column: str = "logfoldchanges", # lambda_param: float = 0.25, # lambda parameter wjaccard weight_column: str = "logfoldchanges", # column in markers_df for weights etc. 
# === Proportion estimage: get_proportions_on_tissue === proportion_method: str = "nnls", # 'nnls', 'ridge', 'elastic' normalization_method: str = "unit", # Options: 'unit', 'zscore',"l1" regularization_alpha: float = 0.01, # regularization alpha l1_ratio: float = 0.7, # L1/L2 ratio for L1+L2 regularization # === Evidence→likelihood mapping (lightweight, non-DL) === evidence_to_likelihood: str = "softmax", # {"row_normalize","softmax"} softmax_tau: float = 1.0, # softmax temperature epsilon: float = 1e-12, # numerical guard # === Final assignment: assign_clusters_from_df === results_column: str = "easydecon", assign_method: str = "max", # {"max","hybrid","zmax"} per your implementation allow_multiple: bool = False, diagnostic=None, fold_change_threshold: float = 2.0, ): # ----------------------- # Phase 1: Priors # ----------------------- phase1_result = common_markers_gene_expression_and_filter( sdata=sdata, marker_genes=markers_df if marker_genes is None else marker_genes, celltype=celltype, gene_id_column=gene_id_column, exclude_group_names=exclude_group_names, bin_size=bin_size, aggregation_method=aggregation_method, add_to_obs=True if marker_genes is not None else False, filtering_algorithm=filtering_algorithm, num_permutations=num_permutations, alpha=alpha, subsample_size=subsample_size, subsample_signal_quantile=subsample_signal_quantile, permutation_gene_pool_fraction=permutation_gene_pool_fraction, n_subs=n_subs, quantile=quantile, parametric=parametric ) if not isinstance(phase1_result, pd.DataFrame): raise TypeError("Phase 1 result must be a pandas DataFrame (spots x clusters).") priors_df = phase1_result.copy() priors_df = priors_df.clip(lower=0) priors_row_sum = priors_df.sum(axis=1).replace(0, np.nan) priors_df = priors_df.div(priors_row_sum, axis=0).fillna(0) # ----------------------- # Phase 2: Evidence # ----------------------- phase2_result = get_clusters_by_similarity_on_tissue( sdata=sdata, markers_df=markers_df, bin_size=bin_size, 
gene_id_column=gene_id_column, method=method, add_to_obs=False, common_group_name="MarkerGroup" if isinstance(marker_genes,list) else None, similarity_by_column=similarity_by_column, weight_column=weight_column, lambda_param=lambda_param ) if not isinstance(phase2_result, pd.DataFrame): raise TypeError("Phase 2 result must be a pandas DataFrame (spots x clusters).") evidence_df = phase2_result.copy() if evidence_to_likelihood == "row_normalize": min_per_row = evidence_df.min(axis=1) needs_shift = (min_per_row < 0) if needs_shift.any(): evidence_df = evidence_df.sub(min_per_row, axis=0) evidence_df = evidence_df.clip(lower=0) evidence_row_sum = evidence_df.sum(axis=1).replace(0, np.nan) likelihoods_df = evidence_df.div(evidence_row_sum, axis=0).fillna(0) elif evidence_to_likelihood == "softmax": x = evidence_df.to_numpy(dtype=float) row_max = np.nanmax(x, axis=1, keepdims=True) logits = (x - row_max) / max(softmax_tau, epsilon) np.exp(logits, out=logits) row_sum = np.sum(logits, axis=1, keepdims=True) row_sum[row_sum == 0] = np.nan likelihoods_np = logits / row_sum likelihoods_np = np.nan_to_num(likelihoods_np, nan=0.0) likelihoods_df = pd.DataFrame(likelihoods_np, index=evidence_df.index, columns=evidence_df.columns) else: raise ValueError("evidence_to_likelihood must be one of {'row_normalize','softmax'}.") # ----------------------- # Posterior combination # ----------------------- if not isinstance(marker_genes,list): common_clusters = priors_df.columns.intersection(likelihoods_df.columns) if len(common_clusters) == 0: raise ValueError("No overlapping cluster columns between Phase 1 and Phase 2 outputs.") priors_aligned = priors_df[common_clusters] likelihoods_aligned = likelihoods_df[common_clusters] common_spots = priors_aligned.index.intersection(likelihoods_aligned.index) if len(common_spots) == 0: raise ValueError("No overlapping spot/bin indices between Phase 1 and Phase 2 outputs.") priors_aligned = priors_aligned.loc[common_spots] likelihoods_aligned = 
likelihoods_aligned.loc[common_spots] posterior_unnorm = priors_aligned * likelihoods_aligned row_sum = posterior_unnorm.sum(axis=1) zero_rows = (row_sum <= epsilon) if zero_rows.any(): posterior_unnorm.loc[zero_rows] = priors_aligned.loc[zero_rows] posterior_row_sum = posterior_unnorm.sum(axis=1).replace(0, np.nan) posterior_df = posterior_unnorm.div(posterior_row_sum, axis=0).fillna(0) else: print("Regular workflow, phase 1 used to find most likely postions and phase 2 to assign labels") posterior_df = None # ----------------------- # Final assignment # ----------------------- assigned_labels = assign_clusters_from_df( sdata, df=posterior_df if posterior_df is not None and not isinstance(marker_genes,list) else phase2_result, bin_size=bin_size, results_column=results_column, method=assign_method, allow_multiple=allow_multiple, diagnostic=diagnostic, fold_change_threshold=fold_change_threshold ) try: proportions_df= get_proportions_on_tissue( sdata, markers_df=markers_df, bin_size=bin_size, add_to_obs=False, gene_id_column="names", common_group_name="MarkerGroup" if isinstance(marker_genes,list) else None, similarity_by_column=similarity_by_column, method=proportion_method, normalization_method=normalization_method, alpha=regularization_alpha, l1_ratio=l1_ratio, verbose=True ) except: print("Proportions could not be estimated, please check if similarity_by_column exists in the data frame...") proportions_df = None print("Finished!") print("Posterior df and proportions can be None if the required columns or input parameters missing...") return phase1_result, phase2_result, assigned_labels, posterior_df if posterior_df is not None and not isinstance(marker_genes,list) else phase2_result, proportions_df
def get_clusters_expression_on_tissue(sdata, markers_df, common_group_name=None,
                                      bin_size=8, gene_id_column="names",
                                      aggregation_method="mean", add_to_obs=True):
    """Aggregate marker-gene expression per cluster for every spot.

    For each spot and each cluster label found in the index of ``markers_df``,
    the expression of that cluster's genes is reduced to one number with
    ``aggregation_method``.

    Args:
        sdata: Object exposing ``tables[f"square_00{bin_size}um"]``; if that
            lookup fails with AttributeError/KeyError, ``sdata`` itself is
            used as the table.
        markers_df: Marker table whose index holds cluster labels and whose
            ``gene_id_column`` holds gene names. Genes absent from
            ``table.var_names`` are ignored.
        common_group_name: If this is a column of ``table.obs``, only spots
            where it is non-zero are processed; otherwise all spots are.
        bin_size: Used to build the table key.
        gene_id_column: Column of ``markers_df`` with gene names.
        aggregation_method: ``"mean"``, ``"median"`` or ``"sum"``.
        add_to_obs: When true, the result columns replace any same-named
            columns in ``table.obs``.

    Returns:
        pandas.DataFrame: rows are ``table.obs.index``, columns are the
        cluster labels; spots that were not processed stay at 0.

    Raises:
        ValueError: If ``aggregation_method`` is not supported.
    """
    # Reductions are taken across genes (axis=1) with DataFrame methods.
    # np.median returns a plain ndarray, so the former ``.values`` access on it
    # could never work.
    aggregators = {
        "mean": lambda frame: frame.mean(axis=1).to_numpy(),
        "median": lambda frame: frame.median(axis=1).to_numpy(),
        "sum": lambda frame: frame.sum(axis=1).to_numpy(),
    }
    if aggregation_method not in aggregators:
        raise ValueError(
            "aggregation_method must be one of {'mean','median','sum'}, "
            f"got {aggregation_method!r}."
        )
    aggregate = aggregators[aggregation_method]

    # NOTE(review): the key is built as 'square_00<bin_size>um', so a two-digit
    # bin size produces e.g. 'square_0016um' -- confirm this matches the table names.
    try:
        table = sdata.tables[f"square_00{bin_size}um"]
    except (AttributeError, KeyError):
        table = sdata

    # Keep only markers that are present in the spatial data.
    markers_df_tmp = markers_df[markers_df[gene_id_column].isin(table.var_names)]

    if common_group_name in table.obs.columns:
        print(f"Processing spots with {common_group_name} != 0")
        spots_with_expression = table.obs[table.obs[common_group_name] != 0].index
    else:
        print("common_group_name column not found in the table, processing all spots.")
        spots_with_expression = table.obs.index

    all_spots = table.obs.index
    all_clusters = markers_df_tmp.index.unique()

    # The gene list of each cluster does not depend on the spot: resolve it once.
    genes_by_cluster = [
        markers_df_tmp.loc[[cluster]][gene_id_column].values
        for cluster in all_clusters
    ]

    # Preallocate as float so that writing aggregated values does not upcast.
    df = pd.DataFrame(0.0, index=all_spots, columns=all_clusters)

    # Kept from the original implementation: registers tqdm's pandas integration.
    tqdm.pandas()

    if genes_by_cluster:
        for spot in tqdm(spots_with_expression, desc='Processing spots', leave=True, position=0):
            # One column per cluster, one row per table row selected by `spot`.
            df.loc[spot] = np.column_stack(
                [aggregate(table[spot, genes].to_df()) for genes in genes_by_cluster]
            )

    if add_to_obs:
        print("Adding results to table.obs of sdata object")
        table.obs.drop(columns=all_clusters, inplace=True, errors='ignore')
        table.obs = pd.merge(table.obs, df, left_index=True, right_index=True)

    return df