Skip to content

Commit

Permalink
add docstrings, licenses etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
sam-the-programmer committed Aug 7, 2023
1 parent c2cbe89 commit 6d865cc
Show file tree
Hide file tree
Showing 28 changed files with 616 additions and 46 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
**/*.egg-info/**
dist/**
**/__pycache__/**
**/__pycache__/**
test.py
15 changes: 15 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[MESSAGES CONTROL]
disable =
too-few-public-methods,
relative-beyond-top-level,
invalid-name,
too-many-arguments

[REPORTS]
output-format = colorized

[FORMAT]
max-line-length=120

[MASTER]
ignore-patterns=test*
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
],
"python.testing.pytestEnabled": false,
"python.testing.unittestEnabled": true,
"python.linting.enabled": true,
"pylint.args": [
"--rcfile=.pylintrc"
],
"files.exclude": {
"**/__pycache__": true,
}
Expand Down
6 changes: 6 additions & 0 deletions castle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# for the sandcastle build tool (https://github.com/sam-the-programmer/sandcastle)
tasks:
lint:
- isort .
- black .
- pylint --rcfile=.pylintrc ./src
11 changes: 11 additions & 0 deletions src/circleml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""CircleML is a machine learning library for Python that's fully PyPy compatible.
Submodules:
- cluster: clustering algorithms
- core: core functions and classes
- knn: k-nearest-neighbours algorithm
- log: logging
- metrics: metrics
"""

from .core.errors import ShapeError, check_len
from .core.math import euclidean_distance, hamming_distance, manhattan_distance
from .log import *
22 changes: 22 additions & 0 deletions src/circleml/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2023 CircleML GitHub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Implementations of clustering algorithms for unsupervised classification.
Submodules:
- dbscan: DBScan algorithm
"""

from .dbscan.dbscan import DBScanCla
22 changes: 22 additions & 0 deletions src/circleml/cluster/dbscan/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2023 CircleML GitHub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Implementation of the DBScan algorithm.
# Classes
- DBScanCla
"""

from .dbscan import DBScanCla
112 changes: 112 additions & 0 deletions src/circleml/cluster/dbscan/dbscan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright 2023 CircleML GitHub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Internal DBScan module. Use circleml.cluster.dbscan instead."""

import typing as t
from collections import namedtuple

import numpy as np

from ... import log
from ...core import euclidean_distance
from ...core.base import UnsupervisedModelABC
from ...core.graph import Graph, Node

# Per-sample record stored as the value of each graph node during DBScan fitting:
#   val  - the sample itself (one element of X)
#   cor  - True once the sample has been marked as a core sample
#   lab  - cluster label; initialised to -1 when the graph is built
#   dead - True once the sample has been visited (labelled) during cluster expansion
dbscan_sample = namedtuple("dbscan_sample", ["val", "cor", "lab", "dead"])


class DBScanCla(UnsupervisedModelABC):
    """The DBScan clustering algorithm, which finds samples with lots of neighbours to find clusters.

    Args:
        min_dist (float): Largest distance at which two samples still count as
            neighbours (the comparison used is ``distance <= min_dist``).
        min_neighbours (int): Number of neighbours a sample needs in order to be
            treated as a core sample. The sample itself is counted whenever its
            distance to itself is within ``min_dist``.
        distance_func (Callable): Function called as ``distance_func(a, b)`` that
            returns the distance between two samples.
    """

    def __init__(
        self,
        min_dist: float = 0.5,
        min_neighbours: int = 4,
        distance_func: t.Callable[[t.Sized, t.Sized], float] = euclidean_distance,
    ) -> None:
        self.min_dist = min_dist
        self.min_neighbours = min_neighbours
        self.distance_func = distance_func
        # Labels computed by the most recent call to fit(); empty until then.
        self.__preds = []

    def __find_neighbours(self, X: np.ndarray, sample: np.ndarray) -> t.List[int]:
        """Return the indices of every sample in X lying within min_dist of `sample`."""
        return [
            index
            for index, other in enumerate(X)
            if self.distance_func(other, sample) <= self.min_dist
        ]

    def fit(self, X: np.ndarray, verbose: bool = False) -> "DBScanCla":
        """Fit the DBScan algorithm to the data.

        Args:
            X (np.ndarray): The data to fit to.
            verbose (bool): Passed to the logger factory to control progress output.

        Returns:
            DBScanCla: This instance, so that calls can be chained. The labels
            themselves are retrieved with ``predict``.
        """
        logger = log.create_logger(log.info, verbose=verbose)
        logger("Constructing search graph")
        gr = Graph(
            [
                Node(dbscan_sample(val=sample, cor=False, lab=-1, dead=False))
                for sample in X
            ]
        )

        logger("Finding neighbours")
        for node in gr.nodes:
            neighbours = self.__find_neighbours(X, node.val.val)
            node.set_edges(neighbours)

            # if the node has enough neighbours, it is a core node
            if len(neighbours) >= self.min_neighbours:
                node.val = node.val._replace(cor=True)

        logger("Selecting core samples")
        core_nodes = [node for node in gr.nodes if node.val.cor]

        logger("Labelling samples")
        current_class = 0
        for core in core_nodes:
            if core.val.dead:  # already absorbed into an earlier cluster
                continue

            # grow a new cluster outwards from this unvisited core node
            self.__label_node(core, current_class, gr)
            current_class += 1

        # Samples never reached from a core node keep their initial label of -1.
        self.__preds = [node.val.lab for node in gr.nodes]
        return self

    def __label_node(self, node: Node, label: int, gr: Graph) -> None:
        """Label a node and every node reachable from it through core nodes.

        The traversal uses an explicit stack rather than recursion, so the size of
        a cluster is not limited by the interpreter's recursion depth and a node
        that lists itself among its own neighbours cannot cause endless recursion.
        """
        pending = [node]
        while pending:
            current = pending.pop()
            if current.val.dead:  # visited nodes keep the label they already have
                continue

            current.val = current.val._replace(lab=label, dead=True)

            # Only core nodes spread the cluster on to their neighbours.
            if current.val.cor:
                pending.extend(gr.nodes[index] for index in current.edges)

    def predict(self, verbose: bool = False) -> t.List[int]:
        """Predict the labels of the data.

        Args:
            verbose (bool): Accepted for interface compatibility; not used here.

        Returns:
            list: One label per sample given to the last ``fit`` call (an empty
            list if ``fit`` has not been called).
        """
        return self.__preds
3 changes: 3 additions & 0 deletions src/circleml/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Core functions and tools, including errors, shape-checkers and mathematical functions."""

from .errors import ShapeError, check_len
from .math import euclidean_distance, hamming_distance, manhattan_distance
5 changes: 4 additions & 1 deletion src/circleml/core/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from .abcs import ModelABC

"""Base classes and ABCs."""

from .abcs import Module, SupervisedModelABC, TransformationABC, UnsupervisedModelABC
76 changes: 71 additions & 5 deletions src/circleml/core/base/abcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,83 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import annotations

"""Abstract base classes used to ensure compatibility with pipelines and other modular components.
# Abstract Base Classes
- Module: Abstract base class for pipeline modules
- ModelABC: Abstract base class for models
"""


from abc import ABC, abstractmethod
from typing import Any

class ModelABC(ABC):
    """Abstract base class shared by every classifier."""

class Module(ABC):
    """Abstract interface implemented by every component usable as a pipeline module."""

    @abstractmethod
    def __call__(self, X: Any) -> Any:
        """Apply the module to ``X``; this base implementation hands ``X`` back unchanged."""
        return X


class SupervisedModelABC(Module):
    """Abstract base class for supervised models, which are fitted on samples and targets."""

    @abstractmethod
    def fit(self, X, y, verbose: bool = False) -> "SupervisedModelABC":
        """Fit the model to the data.

        Implementations are expected to return the fitted model, because
        ``fit_predict`` calls ``predict`` on whatever ``fit`` returns.
        """

    @abstractmethod
    def predict(self, X, verbose: bool = False) -> Any:
        """Predict the class of the data."""

    def __call__(self, X: Any, verbose: bool = False) -> Any:
        """Calling the model is shorthand for ``predict``."""
        return self.predict(X, verbose=verbose)

    def fit_predict(self, X, y, verbose: bool = False) -> Any:
        """Fit the model to the data and predict the class of the data."""
        # verbose is passed by keyword, matching __call__ and the sibling ABCs.
        return self.fit(X, y, verbose=verbose).predict(X, verbose=verbose)


class UnsupervisedModelABC(Module):
    """Abstract interface for models that are fitted on samples alone, without targets."""

    @abstractmethod
    def fit(self, X, verbose: bool = False) -> "UnsupervisedModelABC":
        """Fit the model to the data."""

    @abstractmethod
    def predict(self, verbose: bool = False) -> Any:
        """Predict the class of the data."""

    def fit_predict(self, X, verbose: bool = False) -> Any:
        """Fit the model to ``X``, then return the fitted model's predictions."""
        fitted = self.fit(X, verbose=verbose)
        return fitted.predict(verbose=verbose)

    def __call__(self, X: Any, verbose: bool = False) -> Any:
        """Calling the model fits it to ``X`` and returns the predictions."""
        return self.fit_predict(X, verbose=verbose)


class TransformationABC(Module):
    """Abstract interface for data transformations that can be fitted, applied and inverted."""

    @abstractmethod
    def fit(self, X, verbose: bool = False) -> "TransformationABC":
        """Fit the transformation to the data."""

    @abstractmethod
    def transform(self, X, verbose: bool = False) -> Any:
        """Transform the data."""

    @abstractmethod
    def inverse_transform(self, X, verbose: bool = False) -> Any:
        """Inverse transform the data."""

    def fit_transform(self, X, verbose: bool = False) -> Any:
        """Fit the transformation to ``X``, then return ``X`` transformed by the fitted object."""
        fitted = self.fit(X, verbose=verbose)
        return fitted.transform(X, verbose=verbose)

    def __call__(self, X: Any, verbose: bool = False) -> Any:
        """Calling the transformation is shorthand for ``transform``."""
        return self.transform(X, verbose=verbose)
12 changes: 11 additions & 1 deletion src/circleml/core/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from .errors import ShapeError, check_len

"""Errors and checkers for use in ML algorithms.
# Errors
- ShapeError: raised when the shape of a numpy array is incorrect
# Checkers
- check_len: checks that two lists or numpy arrays have the same length; otherwise raises an error
"""

from .errors import ShapeError, check_len
21 changes: 20 additions & 1 deletion src/circleml/core/errors/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Internal error module. Use circleml.core.errors instead."""

import typing as t

from ...log import check_err


class ShapeError(ValueError):
    """Raised when a shape is invalid or does not match the shape it is compared with."""

def check_len(x: t.Sized, y: t.Sized) -> None:
    """Check that two objects have the same length.

    The comparison result, the error message and the ``ShapeError`` class are
    handed to ``check_err``, which is responsible for raising.

    Args:
        x (t.Sized): first object
        y (t.Sized): second object

    Raises:
        ShapeError: if the lengths are not equal
    """
    check_err(
        len(x) == len(y), "y_true and y_pred must have the same length", ShapeError
    )
23 changes: 23 additions & 0 deletions src/circleml/core/graph/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2023 CircleML GitHub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""An implementation of a graph data structure.
# Classes
- Graph
- Node
"""

from .graph import Graph, Node
Loading

0 comments on commit 6d865cc

Please sign in to comment.