Skip to content

subset_comparisons

handle_filters(include, include_file)

Creates a set containing both the items of the given iterable and the lines of the given file (with trailing whitespace stripped).

Source code in src/rna_clique/subset_comparisons.py
def handle_filters(
        include: Iterable[str],
        include_file: Optional[Path],
) -> set[str]:
    """Creates a set including both the given iterable and file's contents.

    Parameters:
        include:      Strings to put in the resulting set.
        include_file: Path to a text file whose lines are added to the set,
                      or None if there is no such file. Trailing whitespace
                      (including the newline) is stripped from every line.

    Returns:
        A new set holding the strings from include and the file's lines.
    """
    included = set(include)
    # None means "no filter file". Test for it explicitly instead of catching
    # the TypeError that open(None) raises, so that unrelated TypeErrors are
    # not silently swallowed.
    if include_file is not None:
        with open(include_file, "r") as filter_file:
            included.update(line.rstrip() for line in filter_file)
    return included

make_subset_comparisons(inputs, output_dir, matches)

Creates symlinks to stored dataframes whose samples satisfy a predicate.

Parameters:

Name Type Description Default
inputs Iterable[Path]

The Paths to the input dataframe pickles.

required
output_dir Path

The directory in which to create the symlinks.

required
matches Callable[[Path], bool]

Function giving whether a sample's Path is included.

required

Returns:

Type Description
Iterator[DataFrame]

A generator yielding the dataframes whose samples satisfy the predicate.

Source code in src/rna_clique/subset_comparisons.py
def make_subset_comparisons(
        inputs: Iterable[Path],
        output_dir: Path,
        matches: Callable[[Path], bool],
) -> Iterator[pd.DataFrame]:
    """Symlinks and yields the stored dataframes whose samples all match.

    For each input, the values in the first row of the "qsample" and
    "ssample" columns are wrapped in Paths and passed to the predicate. When
    both are accepted, a symlink named after the input is created in the
    output directory, pointing at the input via a relative path, and the
    dataframe is yielded.

    Parameters:
        inputs:            The Paths to the input dataframe pickles.
        output_dir:        The directory in which to create the symlinks.
        matches:           Function giving whether a sample's Path is included.

    Returns:
        A generator yielding the dataframes whose samples satisfy the predicate.
    """
    for table_path in inputs:
        table = read_table(table_path, head=1, head_unsupported=False)
        # Lazily built so that "ssample" is only looked up when "qsample"
        # has already been accepted.
        first_row_samples = (
            Path(table[prefix + "sample"].iloc[0]) for prefix in ("q", "s")
        )
        if not all(map(matches, first_row_samples)):
            continue
        # A one-row table suggests the first read was truncated to its head,
        # so the complete table has to be loaded before it is handed out.
        if table.shape[0] == 1:
            table = read_table(table_path)
        link = output_dir / table_path.name
        link.symlink_to(relative_to(table_path, link.parent))
        yield table

matcher(included=None, excluded=None, include_regex=None)

Returns a function that checks if a string meets certain criteria.

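Specifically, the returned function returns a bool indicating whether its argument is in the provided included Container or matches the given regex, and is not in the excluded Container. If neither an included Container nor a regex is given, every argument that is not excluded is accepted.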

Parameters:

Name Type Description Default
included Optional[Container[str]]

A container of strings to be included.

None
excluded Optional[Container[str]]

A container of strings to be excluded.

None
include_regex Optional[Pattern]

A regular expression to match for inclusion.

None

Returns:

Type Description
Callable[[str], bool]

A function that checks if a str matches the regex or included strings.

Source code in src/rna_clique/subset_comparisons.py
def matcher(
        included: Optional[Container[str]] = None,
        excluded: Optional[Container[str]] = None,
        include_regex: Optional[re.Pattern] = None
) -> Callable[[str], bool]:
    """Builds a predicate that tests a string against the given criteria.

    The predicate accepts a string when it passes the inclusion test and is
    not excluded. The inclusion test passes if the string is in included, if
    include_regex is found anywhere in it, or if neither of those two
    criteria was supplied at all.

    Parameters:
        included:      A container of strings to be included.
        excluded:      A container of strings to be excluded.
        include_regex: A regular expression to match for inclusion.

    Returns:
        A function taking a str and returning whether it meets the criteria.
    """
    def inner(x):
        # Inclusion is decided first; exclusion is only consulted for
        # strings that passed it.
        if included is None and include_regex is None:
            wanted = True
        elif included is not None and x in included:
            wanted = True
        else:
            wanted = (
                include_regex is not None
                and bool(include_regex.search(x))
            )
        if not wanted:
            return False
        return excluded is None or x not in excluded
    return inner

relative_to(p1, p2)

Returns the first path relative to the second.

Source code in src/rna_clique/subset_comparisons.py
def relative_to(p1: Path, p2: Path) -> Path:
    """Expresses the first path relative to the second.

    Parameters:
        p1: The path to be expressed relatively.
        p2: The path that serves as the starting point.

    Returns:
        The relative path leading from p2 to p1.
    """
    target, start = str(p1), str(p2)
    return Path(os.path.relpath(target, start=start))