Skip to content

API Reference

Top-level package for Isoform Perplexity.

call_effective(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL)

Identify effective features per gene based on gene perplexity.

Parameters

df : pd.DataFrame DataFrame containing gene, feature, 'pi', and 'perplexity' columns. gene_col : str Column representing the gene (default: 'gene_id') feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with additional columns: - 'n_effective': perplexity rounded to nearest integer - 'feature_rank': rank of each feature within its gene (by pi) - 'effective': boolean indicating if feature is effective

Source code in src/isoplex/utils.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def call_effective(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL):
    """
    Identify effective features per gene based on gene perplexity.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing gene, feature, 'pi', and 'perplexity' columns.
    gene_col : str
        Column representing the gene (default: 'gene_id')
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with additional columns:
        - 'n_effective': perplexity rounded to nearest integer
        - 'feature_rank': rank of each feature within its gene (by pi)
        - 'effective': boolean indicating if feature is effective
    """
    # round perplexity to nearest integer
    df['n_effective'] = df['perplexity'].round(0)

    # rank features within each gene by pi
    df['feature_rank'] = (
        df.groupby(gene_col)['pi']
        .rank(method='first', ascending=False)
        .astype(int)
    )

    # mark features as effective if rank <= rounded perplexity
    df['effective'] = df['feature_rank'] <= df['n_effective']

    return df

collapse_counts_by_feature(df, feature_col=TRANSCRIPT_COL, expression_type=EXP_COL, gene_col=GENE_COL, sample_col=None)

Collapse counts by a feature (e.g. ORF, transcript) instead of transcript.

Parameters

df : pd.DataFrame Input table with counts at transcript level. feature_col : str Alternative feature column to collapse to (e.g. 'orf_id'). expression_type : str Name of expression col to collapse gene_col : str Column identifying genes. sample_col : str, optional Sample column. If None, assumes single-sample bulk.

Returns

pd.DataFrame Collapsed df with summed counts per feature.

Source code in src/isoplex/utils.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def collapse_counts_by_feature(df,
                               feature_col=TRANSCRIPT_COL,
                               expression_type=EXP_COL,
                               gene_col=GENE_COL,
                               sample_col=None):
    """
    Collapse counts by a feature (e.g. ORF, transcript) instead of transcript.

    Parameters
    ----------
    df : pd.DataFrame
        Input table with counts at transcript level.
    feature_col : str
        Alternative feature column to collapse to (e.g. 'orf_id').
    expression_type : str
        Name of expression col to collapse
    gene_col : str
        Column identifying genes.
    sample_col : str, optional
        Sample column. If None, assumes single-sample bulk.

    Returns
    -------
    pd.DataFrame
        Collapsed df with summed counts per feature.
    """
    group_cols = list(dict.fromkeys([gene_col, feature_col]))

    if sample_col is not None:
        group_cols.append(sample_col)

    # sum counts for all transcripts mapping to the same feature
    out = (df.groupby(group_cols, as_index=False)[expression_type]
          .sum())

    return out

compute_avg_expression(df, sample_col, feature_col=TRANSCRIPT_COL)

Compute average expression across samples for each feature. Only considers samples where feature is expressed!

Parameters

df : pd.DataFrame DataFrame containing feature, sample, and counts columns. sample_col : str Column representing sample IDs. feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with additional column: - 'avg__tpm': mean tpm across samples for the feature

Source code in src/isoplex/utils.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
def compute_avg_expression(df, sample_col, feature_col=TRANSCRIPT_COL):
    """
    Compute average expression across samples for each feature.
    Only considers samples where feature is expressed!

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing feature, sample, and counts columns.
    sample_col : str
        Column representing sample IDs.
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with additional column:
        - 'avg_<feature_col>_tpm': mean tpm across samples for the feature
    """
    temp = df[[feature_col, sample_col, 'tpm']]

    # filter out unexpressed features so mean denominator is correct
    temp = temp.loc[temp['tpm'] > 0]

    # sum counts per feature per sample (handles cases with multiple rows per feature)
    temp = (
        temp.groupby([feature_col, sample_col])
        .sum()
        .reset_index()
        .rename({'tpm': f'{feature_col}_tpm'}, axis=1)
    )
    temp.drop(sample_col, axis=1, inplace=True)

    # then take mean across samples
    temp = (
        temp.groupby(feature_col)
        .mean()
        .reset_index()
        .rename({f'{feature_col}_tpm': f'avg_{feature_col}_tpm'}, axis=1)
    )

    # merge back to original df
    df = df.merge(temp, how='left', on=[feature_col])

    return df

compute_entropy(df, gene_col=GENE_COL)

Compute Shannon entropy per gene based on isoform proportions.

Parameters

df : pd.DataFrame DataFrame containing gene and 'pi' columns. gene_col : str Column representing the gene (default: 'gene_id')

Returns

pd.DataFrame DataFrame with an additional column: - 'entropy': Shannon entropy per gene

Source code in src/isoplex/utils.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def compute_entropy(df, gene_col=GENE_COL):
    """
    Compute Shannon entropy per gene based on isoform proportions.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing gene and 'pi' columns.
    gene_col : str
        Column representing the gene (default: 'gene_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with an additional column:
        - 'entropy': Shannon entropy per gene
    """
    # compute plogp for each feature; avoid log2(0)
    df['plogp'] = df['pi'] * np.log2(df['pi'].replace(0, np.nan))

    # sum plogp per gene
    entropy_df = (
        df[[gene_col, 'plogp']]
        .groupby(gene_col)
        .sum()
        .reset_index()
        .rename({'plogp': 'entropy'}, axis=1)
    )

    # multiply by -1 for positive entropy
    entropy_df['entropy'] = -1 * entropy_df['entropy']

    # merge back to original df
    df = df.merge(entropy_df, how='left', on=gene_col)

    # drop intermediate column
    df = df.drop(columns='plogp')

    return df

compute_expression_breadth(df, sample_col, feature_col=TRANSCRIPT_COL)

Compute percentage of samples in which each feature is effective.

Parameters

df : pd.DataFrame DataFrame containing feature, sample, and 'effective' columns. sample_col : str Column representing sample IDs. feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with additional columns: - 'n_samples_effective': number of samples where the feature is effective - 'expression_breadth': percentage of samples where the feature is effective

Source code in src/isoplex/utils.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
def compute_expression_breadth(df, sample_col, feature_col=TRANSCRIPT_COL):
    """
    Compute percentage of samples in which each feature is effective.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing feature, sample, and 'effective' columns.
    sample_col : str
        Column representing sample IDs.
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with additional columns:
        - 'n_samples_effective': number of samples where the feature is effective
        - 'expression_breadth': percentage of samples where the feature is effective
    """
    n_samples = df[sample_col].nunique()

    temp = (
        df.loc[df['effective'], [feature_col, sample_col]]
        .groupby(feature_col)
        .nunique()
        .reset_index()
        .rename({sample_col: 'n_samples_effective'}, axis=1)
    )

    df = df.merge(temp, how='left', on=feature_col)
    df['expression_breadth'] = df['n_samples_effective'].fillna(0) / n_samples * 100

    return df

compute_expression_var(df, sample_col, feature_col=TRANSCRIPT_COL)

Compute number of samples expressing each feature and pi standard deviation.

Parameters

df : pd.DataFrame DataFrame containing feature, sample, and 'pi' columns. sample_col : str Column representing sample IDs. feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with additional columns: - 'n_exp_samples': number of samples where feature is expressed - 'expression_var': standard deviation of pi across samples

Source code in src/isoplex/utils.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
def compute_expression_var(df, sample_col, feature_col=TRANSCRIPT_COL):
    """
    Compute number of samples expressing each feature and pi standard deviation.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing feature, sample, and 'pi' columns.
    sample_col : str
        Column representing sample IDs.
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with additional columns:
        - 'n_exp_samples': number of samples where feature is expressed
        - 'expression_var': standard deviation of pi across samples
    """
    # number of samples where feature is expressed
    df['n_exp_samples'] = df.groupby(feature_col)[sample_col].transform('nunique')

    # standard deviation of pi across samples
    df['expression_var'] = df.groupby(feature_col)['pi'].transform(lambda x: x.std(ddof=1, skipna=True))

    return df

compute_global_isoform_metrics(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL, expression_type='counts')

Compute isoform or other feature diversity metrics for a single-sample (bulk) dataframe. Either provide counts or TPMs; if counts, will automatically convert to TPM. Optionally, collapse counts to a different feature or compute TPM.

Parameters

df : pd.DataFrame DataFrame containing counts at transcript level. gene_col : str Column representing the gene (default: 'gene_id') feature_col : str Column representing the feature (default: 'transcript_id') expression_type : str Type of expression values in table {'counts' | 'tpm'}

Returns

pd.DataFrame DataFrame with computed metrics.

Source code in src/isoplex/utils.py
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
def compute_global_isoform_metrics(df,
                                   gene_col=GENE_COL,
                                   feature_col=TRANSCRIPT_COL,
                                   expression_type='counts'):
    """
    Compute isoform or other feature diversity metrics for a single-sample (bulk) dataframe.
    Either provide counts or TPMs; if counts, will automatically convert to TPM.
    Optionally, collapse counts to a different feature or compute TPM.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing counts at transcript level.
    gene_col : str
        Column representing the gene (default: 'gene_id')
    feature_col : str
        Column representing the feature (default: 'transcript_id')
    expression_type : str
        Type of expression values in table {'counts' | 'tpm'}

    Returns
    -------
    pd.DataFrame
        DataFrame with computed metrics.
    """
    # validate input
    validate_counts_input(df,
                          gene_col=gene_col,
                          feature_col=feature_col)

    # convert from wide to long
    df = rename_sample_col(df,
                      gene_col=gene_col,
                      feature_col=feature_col,
                      expression_type=expression_type)

    # collapse counts if feature_col is not unique
    df = collapse_counts_by_feature(df,
                                    feature_col=feature_col,
                                    expression_type=expression_type,
                                    gene_col=gene_col,
                                    sample_col=None)

    # compute TPM if required
    df = df if expression_type == 'tpm' else compute_tpm(df)

    # compute isoform ratios
    df = compute_pi(df)

    # compute gene-level metrics
    df = compute_n_detected_features(df, gene_col=gene_col, feature_col=feature_col)
    df = compute_entropy(df, gene_col=gene_col)
    df = compute_perplexity(df)

    # mark effective features
    df = call_effective(df, gene_col=gene_col, feature_col=feature_col)

    return df

compute_max_expression(df, sample_col, feature_col=TRANSCRIPT_COL)

Compute maximum expression across samples for each feature. Only considers samples where feature is expressed!

Parameters

df : pd.DataFrame DataFrame containing feature, sample, and counts columns. sample_col : str Column representing sample IDs. feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with additional column: - 'avg__tpm': mean tpm across samples for the feature

Source code in src/isoplex/utils.py
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
def compute_max_expression(df, sample_col, feature_col=TRANSCRIPT_COL):
    """
    Compute maximum expression across samples for each feature.
    Only considers samples where feature is expressed!

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing feature, sample, and counts columns.
    sample_col : str
        Column representing sample IDs.
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with additional column:
        - 'avg_<feature_col>_tpm': mean tpm across samples for the feature
    """
    temp = df[[feature_col, sample_col, 'tpm']]

    # filter out unexpressed features so mean denominator is correct
    temp = temp.loc[temp['tpm'] > 0]

    # sum counts per feature per sample (handles cases with multiple rows per feature)
    temp = (
        temp.groupby([feature_col, sample_col])
        .sum()
        .reset_index()
        .rename({'tpm': f'{feature_col}_tpm'}, axis=1)
    )
    temp.drop(sample_col, axis=1, inplace=True)

    # then take mean across samples
    temp = (
        temp.groupby(feature_col)
        .max()
        .reset_index()
        .rename({f'{feature_col}_tpm': f'max_{feature_col}_tpm'}, axis=1)
    )

    # merge back to original df
    df = df.merge(temp, how='left', on=[feature_col])

    return df

compute_multi_sample_isoform_metrics(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL, expression_type=EXP_COL)

Compute isoform metrics across multiple samples as well as global metrics.

Parameters

df : pd.DataFrame Input table with counts or TPMs for all samples. gene_col : str Column identifying genes. feature_col : str Column identifying isoforms or other features (e.g. ORFs). expression_type : {'counts', 'tpm'} Type of expression values

Returns

big_df : pd.DataFrame DataFrame with: • per-sample metrics (gene potential, entropy, etc.) • cross-sample metrics (breadth, variance, average expression) global_df : pd.DataFrame DataFrame with: • global entropy, detected features, and perplexity per gene, based on summing tpms across all samples

Source code in src/isoplex/utils.py
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
def compute_multi_sample_isoform_metrics(
    df: pd.DataFrame,
    gene_col: str = GENE_COL,
    feature_col: str = TRANSCRIPT_COL,
    expression_type: str = EXP_COL):
    """
    Compute isoform metrics across multiple samples as well as global
    metrics.

    Parameters
    ----------
    df : pd.DataFrame
        Input table with counts or TPMs for all samples.
    gene_col : str
        Column identifying genes.
    feature_col : str
        Column identifying isoforms or other features (e.g. ORFs).
    expression_type : {'counts', 'tpm'}
        Type of expression values

    Returns
    -------
    big_df : pd.DataFrame
        DataFrame with:
          • per-sample metrics (gene potential, entropy, etc.)
          • cross-sample metrics (breadth, variance, average expression)
    global_df : pd.DataFrame
        DataFrame with:
          • global entropy, detected features, and perplexity per gene,
            based on summing tpms across all samples
    """

    # validate input
    validate_counts_input(df,
                          gene_col=gene_col,
                          feature_col=feature_col)

    # prepare kwargs for downstream functions
    col_kwargs = dict(
        gene_col=gene_col,
        feature_col=feature_col,
        expression_type=expression_type,
    )

    # loop over samples and compute single-sample metrics
    samples = [c for c in df.columns if c not in [gene_col, feature_col]]
    dfs = []

    for s in samples:
        s_df = df[[gene_col, feature_col, s]].copy()
        s_df = compute_global_isoform_metrics(s_df,
                                              gene_col=gene_col,
                                              feature_col=feature_col,
                                              expression_type=expression_type)
        s_df['sample'] = s  # re-attach sample ID
        dfs.append(s_df)

    big_df = pd.concat(dfs, axis=0, ignore_index=True)

    # compute cross-sample metrics on the combined table
    big_df = compute_expression_breadth(big_df,
                                        sample_col='sample',
                                        feature_col=feature_col)
    big_df = compute_expression_var(big_df,
                                    sample_col='sample',
                                    feature_col=feature_col)

    for fc in [feature_col, gene_col]:
        big_df = compute_avg_expression(big_df,
                                        sample_col='sample',
                                        feature_col=fc)
        big_df = compute_max_expression(big_df,
                                        sample_col='sample',
                                        feature_col=fc)

    # sum up tpms across samples
    global_df = (big_df[[gene_col, feature_col, 'tpm']]
          .groupby([gene_col, feature_col])
          .sum()
          .reset_index()
         )

    global_df.rename({'tpm': 'global'}, axis=1, inplace=True)



    global_df = compute_global_isoform_metrics(global_df,
                                               gene_col=gene_col,
                                               feature_col=feature_col,
                                               expression_type='tpm')

    return big_df, global_df

compute_n_detected_features(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL)

Compute gene potential based on the number of unique expressed features per gene.

Parameters

df : pd.DataFrame DataFrame containing gene and feature columns. gene_col : str Column representing the gene (default: 'gene_id') feature_col : str Column representing the feature (default: 'transcript_id')

Returns

pd.DataFrame DataFrame with an additional column: - 'n_detected_features': number of unique features per gene

Source code in src/isoplex/utils.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def compute_n_detected_features(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL):
    """
    Compute gene potential based on the number of unique expressed features per gene.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing gene and feature columns.
    gene_col : str
        Column representing the gene (default: 'gene_id')
    feature_col : str
        Column representing the feature (default: 'transcript_id')

    Returns
    -------
    pd.DataFrame
        DataFrame with an additional column:
        - 'n_detected_features': number of unique features per gene
    """
    # count unique expressed features per gene
    temp = df.loc[df.tpm>0].copy(deep=True)
    n_detected_features_df = (
        temp[[gene_col, feature_col]]
        .groupby(gene_col)
        .nunique()
        .reset_index()
        .rename({feature_col: 'n_detected_features'}, axis=1)
    )

    # merge back to original df
    df = df.merge(n_detected_features_df, how='left', on=gene_col)

    return df

compute_perplexity(df)

Compute perplexity per gene based on Shannon entropy.

Parameters

df : pd.DataFrame DataFrame containing 'gene_id' and 'entropy' columns.

Returns

pd.DataFrame DataFrame with an additional column: - 'perplexity': effective number of isoforms per gene

Source code in src/isoplex/utils.py
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def compute_perplexity(df):
    """
    Compute perplexity per gene based on Shannon entropy.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing 'gene_id' and 'entropy' columns.

    Returns
    -------
    pd.DataFrame
        DataFrame with an additional column:
        - 'perplexity': effective number of isoforms per gene
    """
    # compute perplexity as 2^entropy
    df['perplexity'] = 2 ** df['entropy']
    return df

compute_pi(df, gene_col=GENE_COL)

Generate pi values (isoform ratios) from input expression column.

Parameters

df : pd.DataFrame DataFrame containing at least and 'tpm'. gene_col : str Column representing the gene (default: gene_id)

Returns

pd.DataFrame DataFrame with an additional 'pi' column.

Source code in src/isoplex/utils.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def compute_pi(df, gene_col=GENE_COL):
    """
    Generate pi values (isoform ratios) from input expression column.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing at least <gene_col> and 'tpm'.
    gene_col : str
        Column representing the gene (default: gene_id)

    Returns
    -------
    pd.DataFrame
        DataFrame with an additional 'pi' column.
    """
    df['gene_tpm'] = df.groupby(gene_col)['tpm'].transform('sum')
    df['pi'] = df['tpm'] / df['gene_tpm']
    df = df.drop(columns='gene_tpm')
    return df

compute_tpm(df)

Calculate TPM values from counts for a single-sample (bulk) dataframe and add as a new column to the dataframe.

Parameters

df : pd.DataFrame Input dataframe containing 'counts'.

Returns

pd.DataFrame DataFrame with a new column 'tpm' added.

Source code in src/isoplex/utils.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def compute_tpm(df):
    """
    Calculate TPM values from counts for a single-sample (bulk) dataframe
    and add as a new column to the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing 'counts'.

    Returns
    -------
    pd.DataFrame
        DataFrame with a new column 'tpm' added.
    """
    df['tpm'] = df['counts'] / df['counts'].sum() * 1e6

    return df

flatten_list(l)

Flatten a list into 1 dimension.

Parameters

l : list

Source code in src/isoplex/utils.py
605
606
607
608
609
610
611
612
613
def flatten_list(l):
    """
    Flatten a list into 1 dimension.

    Parameters
    ----------
    l : list
    """
    return [j for i in l for j in i]

rename_sample_col(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL, expression_type=EXP_COL)

Source code in src/isoplex/utils.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def rename_sample_col( df: pd.DataFrame,
                       gene_col: str = GENE_COL,
                       feature_col: str = TRANSCRIPT_COL,
                       expression_type: str = EXP_COL):
    """
    """
    sample_cols = [c for c in df.columns if c not in [gene_col,
                                                      feature_col]]
    if len(sample_cols)>1:
        raise ValueError('DF has more than one sample')

    sample = sample_cols[0]
    df.rename({sample:expression_type}, axis=1, inplace=True)
    return df

validate_counts_input(df, gene_col=GENE_COL, feature_col=TRANSCRIPT_COL)

Validate a wide-format expression DataFrame.

In wide-format input, rows correspond to features (e.g. transcripts), columns (other than gene_col and feature_col) correspond to samples.

Parameters

df : pd.DataFrame Wide-format expression table. gene_col : str Column name identifying genes. feature_col : str Column name identifying features (e.g., transcripts).

Raises

KeyError If either gene_col or feature_col is missing. ValueError If: - any sample column is non-numeric, - there are missing gene or feature IDs, - expression values contain negatives, - all expression values sum to zero.

Source code in src/isoplex/utils.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def validate_counts_input(
    df: pd.DataFrame,
    gene_col: str = GENE_COL,
    feature_col: str = TRANSCRIPT_COL
) -> bool:
    """
    Validate a wide-format expression DataFrame.

    In wide-format input, rows correspond to features (e.g. transcripts),
    columns (other than `gene_col` and `feature_col`) correspond to samples.

    Parameters
    ----------
    df : pd.DataFrame
        Wide-format expression table.
    gene_col : str
        Column name identifying genes.
    feature_col : str
        Column name identifying features (e.g., transcripts).

    Raises
    ------
    KeyError
        If either `gene_col` or `feature_col` is missing.
    ValueError
        If:
          - any sample column is non-numeric,
          - there are missing gene or feature IDs,
          - expression values contain negatives,
          - all expression values sum to zero.
    """

    # Check that gene and feature columns exist
    required_cols = [gene_col, feature_col]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Identify all putative sample columns
    sample_cols = [c for c in df.columns if c not in required_cols]
    if not sample_cols:
        raise ValueError("No sample columns found in input.")

    # All sample columns must be numeric
    non_numeric = [c for c in sample_cols
                   if not pd.api.types.is_numeric_dtype(df[c])]
    if non_numeric:
        raise ValueError(
            f"Found non-numeric columns (likely metadata) in input: {non_numeric}"
        )

    # Check for missing identifiers
    if df[gene_col].isna().any():
        raise ValueError(f"Missing values in `{gene_col}` column.")
    if df[feature_col].isna().any():
        raise ValueError(f"Missing values in `{feature_col}` column.")

    # Check that expression values are non-negative and not all zero
    expr_df = df[sample_cols]
    if (expr_df < 0).any().any():
        raise ValueError("Expression values contain negative numbers.")
    if expr_df.sum().sum() <= 0:
        raise ValueError("Total expression counts sum to zero or less.")

    return True