Module preprocessor.package

A Package is a single file that holds one or more files of data. It is essentially a .zip archive containing several specific files that define metadata about the package. Every package contains a file named ".meta.json" alongside the data itself.

Classes

class Meta (version: MetaVersion, last_modified: datetime.datetime, record: Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport])

Meta(version: preprocessor.package.MetaVersion, last_modified: datetime.datetime, record: Union[preprocessor.package.MetaRecordCsv, preprocessor.package.MetaRecordSql, preprocessor.package.MetaRecordMongo, preprocessor.package.MetaRecordModel, preprocessor.package.MetaRecordJson, preprocessor.package.MetaRecordWordEmbedding, preprocessor.package.MetaRecordAWSs3BucketStorage, preprocessor.package.MetaRecordAzureDataLakeStorage, preprocessor.package.MetaRecordDatabaseReport])

Ancestors

Class variables

var SCHEMA_V1
var SCHEMA_V2
var SCHEMA_VERSION
var last_modified : datetime.datetime
var record : Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport]
var version : MetaVersion

Static methods

def from_record(valid: dict) -> Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport]
def promote_v2(valid: dict) -> dict
def validate(input: dict) -> dict
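
A hedged sketch of how these methods might be combined when reading a package's metadata by hand; the exact flow and the contents of ".meta.json" are assumptions here, and "filename.zip" is a placeholder:

    import json
    import zipfile

    from preprocessor.package import Meta

    with zipfile.ZipFile("filename.zip") as zf:
        raw = json.loads(zf.read(".meta.json"))

    valid = Meta.validate(raw)        # assumed: validate accepts the raw metadata dict
    record = Meta.from_record(valid)  # returns one of the MetaRecord* dataclasses
    print(type(record).__name__)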

Inherited members

class MetaRecordAWSs3BucketStorage (bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str)

MetaRecordAWSs3BucketStorage(bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str)

Class variables

var aws_access_key_id : str
var aws_secret_access_key : str
var bucket_name : str
var object_name : str
var region : str

Static methods

def from_dict(input: dict) -> MetaRecordAWSs3BucketStorage

Methods

def to_dict(self) -> dict
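
As an illustration of the from_dict/to_dict round trip shared by the MetaRecord* classes, a minimal sketch using this record type; all field values are placeholders:

    from preprocessor.package import MetaRecordAWSs3BucketStorage

    record = MetaRecordAWSs3BucketStorage(
        bucket_name="my-bucket",
        region="us-east-1",
        object_name="exports/data.csv",
        aws_access_key_id="AKIA...",      # placeholder credentials
        aws_secret_access_key="...",
    )
    restored = MetaRecordAWSs3BucketStorage.from_dict(record.to_dict())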
class MetaRecordAzureDataLakeStorage (storage_account_name: str, storage_key: str, file_system: str, path: str)

MetaRecordAzureDataLakeStorage(storage_account_name: str, storage_key: str, file_system: str, path: str)

Class variables

var file_system : str
var path : str
var storage_account_name : str
var storage_key : str

Static methods

def from_dict(input: dict) -> MetaRecordAzureDataLakeStorage

Methods

def get_file_client(self) -> azure.storage.filedatalake._data_lake_file_client.DataLakeFileClient
def to_dict(self) -> dict
class MetaRecordCsv (sheet_path: str, path_column: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)

MetaRecordCsv(sheet_path: str, path_column: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)

Class variables

var path_column : str
var sheet_path : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]

Static methods

def from_dict(input: dict) -> MetaRecordCsv

Methods

def to_dict(self) -> dict
class MetaRecordDatabaseReport (query_template: str, params: Dict[str, ReportParameter], connection: str, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None)

MetaRecordDatabaseReport(query_template: str, params: Dict[str, preprocessor.report_parameters.ReportParameter], connection: str, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None)

Class variables

var connection : str
var connection_opts : Optional[dict]
var credentials_info : Optional[dict]
var params : Dict[str, ReportParameter]
var query_template : str

Static methods

def from_dict(input: dict) -> MetaRecordDatabaseReport

Methods

def to_dict(self) -> dict
class MetaRecordJson (data: str)

MetaRecordJson(data: str)

Class variables

var data : str

Static methods

def from_dict(input: dict) -> MetaRecordJson

Methods

def to_dict(self) -> dict
class MetaRecordModel (model_path: str, model_type: str, unrestricted_data: list, reports: ModelReportRecord, logs: list, data_transformers_path: List = None, target_transformers_path: List = None, vocab_path: str = None, target_map_path: str = None)

MetaRecordModel(model_path: str, model_type: str, unrestricted_data: list, reports: preprocessor.package.ModelReportRecord, logs: list, data_transformers_path: List = None, target_transformers_path: List = None, vocab_path: str = None, target_map_path: str = None)

Class variables

var data_transformers_path : List
var logs : list
var model_path : str
var model_type : str
var reports : ModelReportRecord
var target_map_path : str
var target_transformers_path : List
var unrestricted_data : list
var vocab_path : str

Static methods

def from_dict(input: dict) -> MetaRecordModel

Methods

def to_dict(self) -> dict
class MetaRecordMongo (query: str, connection: str, database: str, collection: str, projection: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None, limit: Optional[int] = None, sort: Optional[List] = None)

MetaRecordMongo(query: str, connection: str, database: str, collection: str, projection: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None, limit: Optional[int] = None, sort: Optional[List] = None)

Class variables

var collection : str
var connection : str
var database : str
var limit : Optional[int]
var projection : str
var query : str
var sort : Optional[List]
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]

Static methods

def from_dict(input: dict) -> MetaRecordMongo

Methods

def to_dict(self) -> dict
class MetaRecordSql (query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)

MetaRecordSql(query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)

Class variables

var connection : str
var credentials_info : Optional[dict]
var options : Optional[dict]
var query : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]

Static methods

def from_dict(input: dict) -> MetaRecordSql

Methods

def to_dict(self) -> dict
class MetaRecordWordEmbedding (vocab_path: str, embedding_path: str)

MetaRecordWordEmbedding(vocab_path: str, embedding_path: str)

Class variables

var embedding_path : str
var vocab_path : str

Static methods

def from_dict(input: dict) -> MetaRecordWordEmbedding

Methods

def to_dict(self) -> dict
class MetaVersion (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.

Ancestors

  • enum.Enum

Class variables

var V1
var V2
class ModelReportRecord (input_shape: List[Union[List[int], int]] = None, output_shape: List[int] = None, model_summary: List[str] = None, library_version: str = None)

ModelReportRecord(input_shape: List[Union[List[int], int]] = None, output_shape: List[int] = None, model_summary: List[str] = None, library_version: str = None)

Class variables

var input_shape : List[Union[List[int], int]]
var library_version : str
var model_summary : List[str]
var output_shape : List[int]

Static methods

def from_dict(kvs: Union[dict, list, str, int, float, bool, ForwardRef(None)], *, infer_missing=False) -> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) -> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) -> dataclasses_json.mm.SchemaF[~A]

Methods

def to_dict(self, encode_json=False) -> Dict[str, Union[dict, list, str, int, float, bool, ForwardRef(None)]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, ForwardRef(None)] = None, separators: Tuple[str, str] = None, default: Callable = None, sort_keys: bool = False, **kw) -> str
class Package (path: pathlib.Path, meta: Meta, spec: Optional[Spec])

A collection of data for training or computations, along with descriptions of the contents. A Package is essentially an archive (.zip) of files following a special internal structure:

An example Package with simple tabular data would internally look like:

filename.zip
    .meta.json              # describes version, creation date, etc
    some_kind_of_data.csv   # the data in this package

Image Package files also contain an internal "records.csv" which associates information such as training labels with images within the package. An example Image Package file would internally look like:

filename.zip
    .meta.json      # describes version, creation date, etc
    records.csv     # index of images and labels (for training)
    images/
        img_001.jpg
        img_002.jpg

Packages can also contain info to authenticate and query a database, etc.
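
A quick way to confirm this layout is to open a package with the standard zipfile module; "filename.zip" below is a placeholder for an existing package file:

    import json
    import zipfile

    with zipfile.ZipFile("filename.zip") as zf:
        print(zf.namelist())                      # e.g. ['.meta.json', 'some_kind_of_data.csv']
        meta = json.loads(zf.read(".meta.json"))  # the package metadata described above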

Class variables

var MANIFEST_FILE
var META_FILE
var SPEC_FILE

Static methods

def create(filename: Union[str, pathlib.Path], record_data: Union[str, pathlib.Path], root: Union[str, pathlib.Path] = None, path_column: Optional[str] = None, label_column: Optional[str] = None, header: Optional[List[str]] = None, spec_override: List[FieldOverride] = [], is_masked: bool = True, unmask_columns: Optional[List[str]] = None, supplemental_named_paths: Optional[Dict[str, str]] = None, supplimental_named_paths: Optional[Dict[str, str]] = None) -> Package

Create a Package using a simple CSV or a CSV describing a folder layout

For the simple case, just define the record_data (the CSV file) and optionally the header list if the first row of the CSV does not hold the name of the columns.

For the more complex case of a folder layout, the CSV is a list of files and must contain several specific columns.

  • path_column (required): Name of the column holding the associated data file path/filenames
  • root (optional): The root from which the above paths are relative. If None, paths are relative to the CSV itself.
  • label_column (optional): Name of the column holding a label describing each file. If there are multiple labels per file, use JSON format to specify the list.

Args

filename : Union[str, Path]
Filename of the Package to create
record_data : Union[str, Path]
Filename of data used to populate this Package.
root : Union[str, Path], optional
Path to the root of the data folder. Default is None
path_column : str, optional
Name of the column in the record_data file which contains paths to data files. If None, the record_data is treated as a simple tabular data file.
label_column : str, optional
Name of the label column. When a path_column exists, this column holds labels associated with the file in the path_column. Multi-label datasets need to be in JSON format.
header : List[str], optional
A list of column names. If None, the first row of the CSV will be used as a header.
spec_override : List[FieldOverride], optional
is_masked : bool
Whether or not the data is masked.
unmask_columns : [str], optional
List of individual fields to unmask.
supplemental_named_paths : [Dict[str,str]], optional
This is a dictionary of name:path indicating files to be included in the package
supplimental_named_paths : [Dict[str,str]], optional
Do not use, backwards compatibility for typo.

Returns

Package
The archive object
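
A hedged sketch of both creation modes described above; the file names and column names are placeholders:

    from preprocessor.package import Package

    # Simple tabular case: package a plain CSV whose first row is the header.
    pkg = Package.create("tabular.zip", "measurements.csv")

    # Folder-layout case: records.csv lists file paths (relative to images/) and labels.
    img_pkg = Package.create(
        "images.zip",
        "records.csv",
        root="images/",
        path_column="file",
        label_column="label",
    )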
def create_database_report(filename: Union[str, pathlib.Path], query_template: str, params: Dict[str, ReportParameter], connection: str, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None) -> Package

Validate and create a database report Package.

Args

filename : Union[str, Path]
Filename of the package to be created.
query_template : str
The SQL query template containing Mustache template parameters for the report.
params : Dict[str, ReportParameter]
Parameters for this report.
connection : str
The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: https://docs.sqlalchemy.org/core/connections.html
connection_opts : dict, optional
Dictionary of database connection options.
credentials_info : dict, optional
Dictionary of credentials information if not provided in the connection string.

Raises

Exception
Query template must not be blank.
Exception
Report parameters are required.
Exception
Invalid param_type for ReportParameter …
Exception
PARAM does not appear in the query template.
Exception
PARAM missing from params.
Exception
Name must be unique for each parameter, PARAM reused.

Returns

Package
The created report package object
def create_from_database(filename: Union[str, pathlib.Path], query: str, connection: str) -> Package

Define a package extracted from a database-held dataset.

Args

filename : Union[str, Path]
Filename of the package to be created.
query : str
The SQL query used to collect the dataset.
connection : str
The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: https://docs.sqlalchemy.org/core/connections.html

Returns

Package
The archive object
def from_aws_s3_bucket_storage(filename: Union[str, pathlib.Path], bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str) -> Package

Create a package file referencing an AWS S3 Bucket Storage data file

Args

filename : Union[str, Path]
Filename of the package to be created.
bucket_name : str
Name of the AWS S3 Bucket containing the data file
region : str
The AWS region
object_name : str
The file name, known as the object name or key in AWS S3
aws_access_key_id : str
Access key for this account, region, bucket
aws_secret_access_key : str
Secret access key for this account, region, bucket

Returns

Package
The created Package object
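
A minimal usage sketch; every value below is a placeholder, and real credentials should come from a secret store rather than source code:

    from preprocessor.package import Package

    pkg = Package.from_aws_s3_bucket_storage(
        "s3_data.zip",
        bucket_name="my-bucket",
        region="us-east-1",
        object_name="exports/data.csv",
        aws_access_key_id="AKIA...",
        aws_secret_access_key="...",
    )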
def from_azure_blob_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, key: str) -> Package

Create a package file referencing an Azure Blob Storage data file

Args

filename : Union[str, Path]
Filename of the package to be created.
storage_account_name : str
The Azure storage account to reference.
storage_key : str
Access token used when pulling files from the storage account.
file_system : str
File system defined in the Azure control panel for the storage account.
key : str
The full path to the file that will be downloaded.

Returns

Package
The created Package object
def from_azure_data_lake_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, path: str) -> Package

Create a package file referencing an Azure Data Lake Storage data file

Args

filename : Union[str, Path]
Filename of the package to be created.
storage_account_name : str
The Azure storage account to reference.
storage_key : str
Access token used when pulling files from the storage account.
file_system : str
File system defined in the Azure control panel for the storage account.
path : str
The full path to the file that will be downloaded.

Returns

Package
The created Package object
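
A minimal usage sketch with placeholder account, key, file system and path:

    from preprocessor.package import Package

    pkg = Package.from_azure_data_lake_storage(
        "adls_data.zip",
        storage_account_name="mystorageaccount",
        storage_key="<storage-key>",
        file_system="datalake",
        path="exports/data.csv",
    )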
def from_image_dataset_folder(output_zip: Union[str, pathlib.Path], path: Union[str, pathlib.Path]) -> Package

Create package from torch style image dataset folder structure

NOTE: labels must be numeric values

Assumes the structure:

path/
    <label>/
        imgs
    <label>/
        imgs

Args

output_zip : Union[str, Path]
Path of output zipfile
path : Union[str,Path]
Path to folder structure

Returns

(Package): Package file holding the given input data.
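
A minimal usage sketch, assuming "dataset/" is a placeholder folder laid out as described above with one numeric-label subfolder per class:

    from preprocessor.package import Package

    pkg = Package.from_image_dataset_folder("images.zip", "dataset/")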

def from_json(filename: Union[str, pathlib.Path], data: Or()) -> Package

Create package from JSON file.

Args

filename : Union[str, Path]
Filename of the package to be created.
data : str
Filename of the data to package

Returns

Package
The created Package object
def from_model(filename: Union[str, pathlib.Path], model_type: str, model_path: Or(), unrestricted_data: List[str] = [], reports: Optional[ModelReportRecord] = None, logs: List[str] = [], data_transformers_path: Union[str, pathlib.Path] = '', target_transformers_path: Union[str, pathlib.Path] = '', vocab_path: Union[str, pathlib.Path] = '', target_map_path: Union[str, pathlib.Path] = '', validation_hash: Optional[str] = None) -> Package

Create package from model file.

Args

filename : Union[str, Path]
Filename of the package to create
model_type : str
Model format, i.e. "torch", "keras", etc.
model_path : str
Current location of model to archive

Returns

Package
The created Package object
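
A minimal usage sketch; the model file name is a placeholder, and "torch" follows the model_type examples given above:

    from preprocessor.package import Package

    pkg = Package.from_model("model.zip", model_type="torch", model_path="trained_model.pt")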
def from_numpy(output_zip: Union[str, pathlib.Path], X: Union[numpy.ndarray, str, pathlib.Path], y: Union[List, numpy.ndarray, str, pathlib.Path] = None) -> Package

Prepare a single data file from numpy as an appropriately structured Package

Args

output_zip : Union[str, Path]
Path of Package to create
X : np.array
Training data
y : np.array
Training labels

Returns

(Package): Package file holding the given input data.
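
A minimal usage sketch with toy numpy arrays:

    import numpy as np

    from preprocessor.package import Package

    X = np.random.rand(100, 4)         # toy training data
    y = np.random.randint(0, 2, 100)   # toy binary labels
    pkg = Package.from_numpy("numpy_data.zip", X, y)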

def from_single_file(output: Union[str, pathlib.Path], input: Union[str, pathlib.Path], is_masked: bool = True, unmask_columns: Optional[List[str]] = None) -> Tuple[Package, bool]

Prepare a single data file as an appropriately structured Package

Args

output : Union[str, Path]
Path of Package to create
input : Union[str, Path]
Path of data to be placed into the Package
is_masked : bool
Whether or not the data is masked.
unmask_columns : [str], optional
List of column names that are unmasked. Defaults to masking all columns.

Raises

Exception
Unable to ascertain the proper Package to hold the data

Returns

(Package, bool): Package file holding the given input data, plus a boolean indicating if it is a package of images.
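
A minimal usage sketch; "measurements.csv" is a placeholder input file:

    from preprocessor.package import Package

    pkg, is_images = Package.from_single_file("data.zip", "measurements.csv")
    if is_images:
        print("packaged as an image dataset")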

def from_word_embedding(filename: Union[str, pathlib.Path], embedding_path: Union[str, pathlib.Path], vocab_path: Union[str, pathlib.Path] = '') -> Package

Create word embedding package for training

Args

filename : Union[str, Path]
Filename of the package to be created
embedding_path : Union[str, Path]
Path to the source word embedding
vocab_path : Union[str, Path]
Path to the source vocabulary for embedding

Returns

Package
The created Package object
def load(path: Union[str, pathlib.Path], validation_hash: Optional[str] = None) -> Package

Instantiates a Package object from a file

Args

path : Union[str, Path]
The file to load; it must already be in package format
validation_hash : Optional[str], optional
The expected hash of the package contents. The router stores the hash at asset registration. Defaults to None, which bypasses the check.

Returns

Package
A Package instance
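
A minimal usage sketch; "data.zip" is a placeholder for an existing package, and the second call only illustrates where a registration hash would be supplied:

    from preprocessor.package import Package

    pkg = Package.load("data.zip")    # no validation_hash, so the check is bypassed

    expected = pkg.hash_contents()    # illustration only; normally supplied by the router
    verified = Package.load("data.zip", validation_hash=expected)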
def reference_from_database(filename: Union[str, pathlib.Path], query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None) -> Package

Define a package referring to a database-held dataset.

Args

filename : Union[str, Path]
Filename of the package to be created.
query : str
The SQL query which will be used to collect the dataset.
connection : str
The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: https://docs.sqlalchemy.org/core/connections.html
options : dict, optional
Dictionary of database connection options.
credentials_info : dict, optional
Dictionary of credentials information if not provided in the connection string.

Returns

Package
The archive object
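
A minimal usage sketch; the connection string and query are placeholders:

    from preprocessor.package import Package

    pkg = Package.reference_from_database(
        "db_ref.zip",
        query="SELECT id, age, outcome FROM patients",
        connection="postgresql://user:password@db.example.com:5432/clinical",
    )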
def reference_from_mongo_database(filename: Union[str, pathlib.Path], query: str, connection: str, database: str, collection: str, projection: dict = {}, limit: Optional[int] = None, sort: Optional[List] = None) -> Package

Define a package referring to a database-held dataset.

Args

filename : Union[str, Path]
Filename of the package to be created.
query : str
JSON dictionary which is compatible with pymongo.
connection : str
Mongo connection uri. See: https://docs.mongodb.com/manual/reference/connection-string/

Returns

Package
The archive object
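
A minimal usage sketch; the connection URI, database and collection names are placeholders, and the query is a pymongo-compatible JSON string:

    from preprocessor.package import Package

    pkg = Package.reference_from_mongo_database(
        "mongo_ref.zip",
        query='{"status": "active"}',
        connection="mongodb://user:password@mongo.example.com:27017",
        database="clinical",
        collection="patients",
        limit=1000,
    )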

Instance variables

var filename : str
var meta : Meta
var path : pathlib.Path
var spec : Spec

Methods

def create_sqlalchemy_engine(self) -> sqlalchemy.engine.base.Engine

Create a SQLAlchemy engine for the package's database

Returns

sqlalchemy.engine.Engine
The engine
def get_data_transforms(self)
def get_model_misclassifications(self) -> List[pandas.core.frame.DataFrame]

Get information about failed test cases during model training

If the final model failed to produce correct results for any of the labeled test data, a sample of "unrestricted" information about those failures is returned. The unrestricted data is declared by the dataset owner when they mark a data column as Unmasked.

A maximum of 10 records are returned per client.

Returns

List[pd.Dataframe]
Dataframes holding the unmasked data for failures, up to one per client
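
A minimal usage sketch, assuming "trained_model.zip" is a placeholder model package produced by training:

    from preprocessor.package import Package

    pkg = Package.load("trained_model.zip")
    for frame in pkg.get_model_misclassifications():
        print(frame.head())   # up to 10 unmasked failure records per client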
def get_package_type(self) -> PackageType

Get the category of the packaged data

Returns

PackageType
An indication of the content of the package
def get_target_mapping(self)
def get_target_transforms(self)
def get_vocab(self)
def get_word_embedding(self)
def hash_contents(self)

Hashes the contents of the Package

For all Package types, hash the concatenated CRC-32 values of files in the package, excluding spec files. For database variant Packages, the database query is stored in metadata and also gets hashed.

Note: The hash is not stored in the Package.

Returns

string
hexdigest of sha256 hash
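
A minimal usage sketch; "data.zip" is a placeholder package:

    from preprocessor.package import Package

    pkg = Package.load("data.zip")
    digest = pkg.hash_contents()
    print(digest)   # sha256 hexdigest; compare against the hash stored at asset registration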
def iter_records(self) -> PackageIterator
def model(self)

Extract model contained in package to memory

Possible model_types include:

  • keras
  • pytorch
  • sklearn
  • recommender
  • onnx: A ModelProto object
  • xgboost: xgboost.XGBClassifier or xgboost.XGBRegressor
  • pmml_regression: A privophy.RegressionModel or privophy.GeneralRegressionModel object
  • pmml_tree
  • network_builder: JSON describing the TripleBlind model (e.g. split NN, vertical network)

Returns

Or[Pytorch, Keras, SKlearn, XGBoost, Recommender models, PMMLRegression, JSON]
The model
def model_pointer(self) -> Tuple[str, object]

Return the model type and a file pointer directly to the model file path inside the zip file. model_types include: keras, pytorch, sklearn, recommender, and xgboost.

Returns

Tuple[model_type as string, zip file pointer to model path]

def perform_database_report(self, report_values) -> pandas.core.frame.DataFrame
def populate_spec(self, force: bool = False)
def record_data(self) -> pandas.core.frame.DataFrame
def record_data_as_file(self)
def records(self) -> pandas.core.frame.DataFrame
def records_chunked(self, chunksize: int) -> Iterator[pandas.core.frame.DataFrame]
def regenerate_spec(self)
def store_synthesizer(self, synthesizer, synth_type: str, synth_version: Optional[str] = None)

Stores the trained synthesizer for the package

Args

synthesizer
The synthesizer model to store in the package.
synth_type : str

The type of the synthesizer model to store.

Supported synthesizer types include:

  • CTGAN
  • LEGACY

synth_version : str
The version of the synthesizer model to store.
async def substitute_connection_secrets(self, secret_store)

Use the Access Point provided secret store to replace handlebar variables in connection strings.

def substitute_connection_secrets_sync(self, secret_store)
def synthesizer(self)

Returns data synthesizer model in memory

Supported synthesizer types include:

  • CTGAN

def validate_db_connection(self)
def validate_sql(self)

Run an SQL linter on the query to validate syntax.

Raises

ValueError
Failed SQLFluff linter. Content is a list of error strings in the format: ["{line number}.{column number}: {error message}", …]
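
A minimal usage sketch, assuming "db_ref.zip" is a placeholder database-reference package created earlier:

    from preprocessor.package import Package

    pkg = Package.load("db_ref.zip")
    try:
        pkg.validate_sql()
    except ValueError as err:
        print(err)   # list of "{line}.{column}: {message}" strings from SQLFluff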
class PackageIterator (parent: Package, zip: zipfile.ZipFile, df: pandas.core.frame.DataFrame)

Helper for walking through the contents of a Package file

Ancestors

  • collections.abc.Iterator
  • collections.abc.Iterable
  • typing.Generic
class PackageType (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.

Ancestors

  • enum.Enum

Class variables

var AWS_S3_BUCKET_STORAGE
var AZURE_DATA_LAKE_STORAGE
var CSV
var DATABASE_REPORT
var JSON
var MODEL
var MONGO
var SQL
var WORD_EMBEDDING