Module preprocessor.package
A Package is a single file used to hold one or more files of data. The Package is essentially a .zip archive with several specific files inside it to define metadata about the package. Each package contains a file named ".meta.json" as well as the data itself.
Classes
class Meta (version: MetaVersion, last_modified: datetime.datetime, record: Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport])
Ancestors
- IsDict
- abc.ABC
Class variables
var SCHEMA_V1
var SCHEMA_V2
var SCHEMA_VERSION
var last_modified : datetime.datetime
var record : Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport]
var version : MetaVersion
Static methods
def from_record(valid: dict) -> Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport]
def promote_v2(valid: dict) -> dict
def validate(input: dict) -> dict
class MetaRecordAWSs3BucketStorage (bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str)
Class variables
var aws_access_key_id : str
var aws_secret_access_key : str
var bucket_name : str
var object_name : str
var region : str
Static methods
def from_dict(input: dict) -> MetaRecordAWSs3BucketStorage
Methods
def to_dict(self) -> dict
class MetaRecordAzureDataLakeStorage (storage_account_name: str, storage_key: str, file_system: str, path: str)
Class variables
var file_system : str
var path : str
var storage_account_name : str
var storage_key : str
Static methods
def from_dict(input: dict) -> MetaRecordAzureDataLakeStorage
Methods
def get_file_client(self) -> azure.storage.filedatalake._data_lake_file_client.DataLakeFileClient
def to_dict(self) -> dict
class MetaRecordCsv (sheet_path: str, path_column: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)
Class variables
var path_column : str
var sheet_path : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordCsv
Methods
def to_dict(self) -> dict
class MetaRecordDatabaseReport (query_template: str, params: Dict[str, ReportParameter], connection: str, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None)
Class variables
var connection : str
var connection_opts : Optional[dict]
var credentials_info : Optional[dict]
var params : Dict[str, ReportParameter]
var query_template : str
Static methods
def from_dict(input: dict) -> MetaRecordDatabaseReport
Methods
def to_dict(self) -> dict
class MetaRecordJson (data: str)
Class variables
var data : str
Static methods
def from_dict(input: dict) -> MetaRecordJson
Methods
def to_dict(self) -> dict
class MetaRecordModel (model_path: str, model_type: str, unrestricted_data: list, reports: ModelReportRecord, logs: list, data_transformers_path: List = None, target_transformers_path: List = None, vocab_path: str = None, target_map_path: str = None)
Class variables
var data_transformers_path : List
var logs : list
var model_path : str
var model_type : str
var reports : ModelReportRecord
var target_map_path : str
var target_transformers_path : List
var unrestricted_data : list
var vocab_path : str
Static methods
def from_dict(input: dict) -> MetaRecordModel
Methods
def to_dict(self) -> dict
class MetaRecordMongo (query: str, connection: str, database: str, collection: str, projection: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None, limit: Optional[int] = None, sort: Optional[List] = None)
Class variables
var collection : str
var connection : str
var database : str
var limit : Optional[int]
var projection : str
var query : str
var sort : Optional[List]
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordMongo
Methods
def to_dict(self) -> dict
class MetaRecordSql (query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)
Class variables
var connection : str
var credentials_info : Optional[dict]
var options : Optional[dict]
var query : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordSql
Methods
def to_dict(self) -> dict
class MetaRecordWordEmbedding (vocab_path: str, embedding_path: str)
Class variables
var embedding_path : str
var vocab_path : str
Static methods
def from_dict(input: dict) -> MetaRecordWordEmbedding
Methods
def to_dict(self) -> dict
class MetaVersion (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Ancestors
- enum.Enum
Class variables
var V1
var V2
class ModelReportRecord (input_shape: List[Union[List[int], int]] = None, output_shape: List[int] = None, model_summary: List[str] = None, library_version: str = None)
Class variables
var input_shape : List[Union[List[int], int]]
var library_version : str
var model_summary : List[str]
var output_shape : List[int]
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, None], *, infer_missing=False) -> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) -> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) -> dataclasses_json.mm.SchemaF[~A]
Methods
def to_dict(self, encode_json=False) -> Dict[str, Union[dict, list, str, int, float, bool, None]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, None] = None, separators: Tuple[str, str] = None, default: Callable = None, sort_keys: bool = False, **kw) -> str
class Package (path: pathlib.Path, meta: Meta, spec: Optional[Spec])
-
A collection of data for training or computations, along with descriptions of the contents. A Package is essentially an archive (.zip) of files following a special internal structure.

An example Package with simple tabular data would internally look like:

    filename.zip
        .meta.json             # describes version, creation date, etc.
        some_kind_of_data.csv  # the data in this package

Image Package files also contain an internal "records.csv" which associates information such as training labels with images within the package. An example Image Package file would internally look like:

    filename.zip
        .meta.json    # describes version, creation date, etc.
        records.csv   # index of images and labels (for training)
        images/
            img_001.jpg
            img_002.jpg
Packages can also contain info to authenticate and query a database, etc.
Class variables
var MANIFEST_FILE
var META_FILE
var SPEC_FILE
Static methods
def create(filename: Union[str, pathlib.Path], record_data: Union[str, pathlib.Path], root: Union[str, pathlib.Path] = None, path_column: Optional[str] = None, label_column: Optional[str] = None, header: Optional[List[str]] = None, spec_override: List[FieldOverride] = [], is_masked: bool = True, unmask_columns: Optional[List[str]] = None, supplemental_named_paths: Optional[Dict[str, str]] = None, supplimental_named_paths: Optional[Dict[str, str]] = None) -> Package
-
Create a Package using a simple CSV or a CSV describing a folder layout.

For the simple case, just define the record_data (the CSV file) and optionally the header list if the first row of the CSV does not hold the names of the columns.

For the more complex case of a folder layout, the CSV is a list of files and must contain several specific columns:
* path_column (required): Name of the column holding the associated data file paths/filenames.
* root (optional): The root from which the above paths are relative. If None, paths are relative to the CSV itself.
* label_column (optional): Name of the column holding a label describing each file. If there are multiple labels per file, use JSON format to specify the list.
Args
filename : Union[str, Path] - Filename of the Package to create.
record_data : Union[str, Path] - Filename of the data used to populate this Package.
root : Union[str, Path], optional - Path to the root of the data folder. Default is None.
path_column : str, optional - Name of the column in the record_data file which contains paths to data files. If None, the record_data is treated as a simple tabular data file.
label_column : str, optional - Name of the label column. When a path_column exists, this column holds labels associated with the file in the path_column. Multi-label datasets need to be in JSON format.
header : List[str], optional - A list of column names. If None, the first row of the CSV will be used as the header.
spec_override : List[FieldOverride], optional - Default is [].
is_masked : bool - Whether or not the data is masked.
unmask_columns : List[str], optional - List of individual fields to unmask.
supplemental_named_paths : Dict[str, str], optional - A dictionary of name:path entries indicating files to be included in the package.
supplimental_named_paths : Dict[str, str], optional - Do not use; retained for backwards compatibility with a typo.
Returns
Package - The archive object.
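A usage sketch of both modes; the file names below ("records.csv", "data/") are hypothetical:

    from preprocessor.package import Package

    # Simple case: the CSV itself is the tabular data.
    pkg = Package.create("tabular.zip", record_data="records.csv")

    # Folder-layout case: the CSV lists files under a root folder, with file
    # paths in the "path" column and labels in the "label" column.
    img_pkg = Package.create(
        "images.zip",
        record_data="records.csv",
        root="data",
        path_column="path",
        label_column="label",
    )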
def create_database_report(filename: Union[str, pathlib.Path], query_template: str, params: Dict[str, ReportParameter], connection: str, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None) -> Package
def create_from_database(filename: Union[str, pathlib.Path], query: str, connection: str) -> Package
-
Define a package extracted from a database-held dataset.
Args
filename : Union[str, Path] - Filename of the package archive to create.
query : str - The SQL query used to collect the dataset.
connection : str - The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: https://docs.sqlalchemy.org/core/connections.html
Returns
Package - The archive object.
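For example (the query and connection string are placeholders):

    from preprocessor.package import Package

    # The dataset is collected via the SQLAlchemy connection string.
    pkg = Package.create_from_database(
        "sales.zip",
        query="SELECT * FROM sales WHERE year = 2020",
        connection="postgresql://user:password@dbhost:5432/warehouse",
    )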
def from_aws_s3_bucket_storage(filename: Union[str, pathlib.Path], bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str) -> Package
-
Create a package file that references a file stored in AWS S3 Bucket Storage
Args
filename : Union[str, Path] - Filename of the package archive to create.
bucket_name : str - Name of the AWS S3 bucket containing the file.
region : str - The AWS region.
object_name : str - The file name, known as an object or key in AWS S3 terminology.
aws_access_key_id : str - Access key for this account, region, and bucket.
aws_secret_access_key : str - Secret access key for this account, region, and bucket.
Returns
Package - The archive object.
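A usage sketch; every value below is a placeholder, and real credentials should come from a secrets store rather than source code:

    from preprocessor.package import Package

    pkg = Package.from_aws_s3_bucket_storage(
        "remote_data.zip",
        bucket_name="my-data-bucket",
        region="us-east-1",
        object_name="datasets/records.csv",
        aws_access_key_id="AKIA...",
        aws_secret_access_key="...",
    )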
def from_azure_blob_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, key: str) -> Package
-
Create a package file that references a table stored in Azure Data Lake Storage
Args
filename : Union[str, Path] - Filename of the package archive to create.
storage_account_name : str - The Azure storage account to reference.
storage_key : str - Access token used when pulling files from the storage account.
file_system : str - File system defined in the Azure control panel for the storage account.
key : str - The full path to the file that will be downloaded.
Returns
Package - The archive object.
def from_azure_data_lake_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, path: str) -> Package
-
Create a package file that references a table stored in Azure Data Lake Storage
Args
filename : Union[str, Path] - Filename of the package archive to create.
storage_account_name : str - The Azure storage account to reference.
storage_key : str - Access token used when pulling files from the storage account.
file_system : str - File system defined in the Azure control panel for the storage account.
path : str - The full path to the file that will be downloaded.
Returns
Package - The archive object.
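A usage sketch with placeholder account, key, file system, and path values:

    from preprocessor.package import Package

    pkg = Package.from_azure_data_lake_storage(
        "remote_data.zip",
        storage_account_name="mystorageaccount",
        storage_key="<storage-key>",
        file_system="datasets",
        path="records/2020/records.csv",
    )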
def from_image_dataset_folder(output_zip: Union[str, pathlib.Path], path: Union[str, pathlib.Path]) -> Package
-
Create a package from a torch-style image dataset folder structure.

NOTE: labels must be numeric values.

Assumes the structure:

    path/
        <label>/
            imgs
        <label>/
            imgs

Args
output_zip : Union[str, Path] - Path of the output zipfile.
path : Union[str, Path] - Path to the folder structure.
Returns
(Package): Package file holding the given input data.
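For instance ("dataset/" is a hypothetical folder laid out as above, e.g. dataset/0/*.jpg and dataset/1/*.jpg for the numeric labels 0 and 1):

    from preprocessor.package import Package

    pkg = Package.from_image_dataset_folder("images.zip", "dataset")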
def from_json(filename: Union[str, pathlib.Path], data: Or(str, pathlib.Path)) -> Package
-
Create package from JSON file.
Args
filename : Union[str, Path] - Filename of the package archive to create.
data : str - Current location of the data to archive.
Returns
Package - The archive object.
def from_model(filename: Union[str, pathlib.Path], model_type: str, model_path: Or(str, pathlib.Path), unrestricted_data: List[str] = [], reports: Optional[ModelReportRecord] = None, logs: List[str] = [], data_transformers_path: Union[str, pathlib.Path] = '', target_transformers_path: Union[str, pathlib.Path] = '', vocab_path: Union[str, pathlib.Path] = '', target_map_path: Union[str, pathlib.Path] = '', validation_hash: Optional[str] = None) -> Package
-
Create package from model file.
Args
filename : Union[str, Path] - Filename of the package archive to create.
model_type : str - Model format, e.g. torch, keras, etc.
model_path : str - Current location of the model to archive.
Returns
Package - The archive object.
def from_numpy(output_zip: Union[str, pathlib.Path], X: Union[numpy.ndarray, str, pathlib.Path], y: Union[List, numpy.ndarray, str, pathlib.Path] = None) -> Package
-
Prepare a single data file from numpy as an appropriately structured Package
Args
output_zip : Union[str, Path] - Path of the Package to create.
X : np.array - Training data.
y : np.array - Training labels.
Returns
(Package): Package file holding the given input data.
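A sketch with synthetic arrays, purely for illustration:

    import numpy as np
    from preprocessor.package import Package

    X = np.random.rand(100, 4)             # 100 samples, 4 features
    y = np.random.randint(0, 2, size=100)  # binary labels

    pkg = Package.from_numpy("training_data.zip", X=X, y=y)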
def from_single_file(output: Union[str, pathlib.Path], input: Union[str, pathlib.Path], is_masked: bool = True, unmask_columns: Optional[List[str]] = None) -> Tuple[Package, bool]
-
Prepare a single data file as an appropriately structured Package
Args
output : Union[str, Path] - Path of the Package to create.
input : Union[str, Path] - Path of the data to be placed into the Package.
is_masked : bool - Whether or not the data is masked.
unmask_columns : List[str], optional - List of column names that are unmasked. Defaults to masking all columns.
Raises
Exception - Unable to ascertain the proper Package type to hold the data.
Returns
(Package, bool): Package file holding the given input data, plus a boolean indicating if it is a package of images.
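Note the tuple return value; a sketch with a hypothetical "data.csv":

    from preprocessor.package import Package

    pkg, is_images = Package.from_single_file("out.zip", "data.csv")
    if is_images:
        print("packaged an image dataset")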
def from_word_embedding(filename: Union[str, pathlib.Path], embedding_path: Union[str, pathlib.Path], vocab_path: Union[str, pathlib.Path] = '') -> Package
-
Create word embedding package for training
Args
filename : Union[str, Path] - Filename of the package archive to create.
embedding_path : Union[str, Path] - Path to the word embedding file.
vocab_path : Union[str, Path] - Path to the vocabulary for the embedding.
Returns
Package - The archive object.
def load(path: Union[str, pathlib.Path], validation_hash: Optional[str] = None) -> Package
-
Instantiates a Package object from a file
Args
path : Union[str, Path] - The file, which must already be in package format.
validation_hash : Optional[str], optional - The expected hash of the package contents. The router stores the hash at asset registration. Defaults to None, which bypasses the check.
Returns
Package - A Package instance.
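A usage sketch; the file name and hash value are placeholders:

    from preprocessor.package import Package

    # Load without validation ...
    pkg = Package.load("training_data.zip")

    # ... or verify the contents against a previously recorded hash.
    pkg = Package.load("training_data.zip", validation_hash="9f2c...")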
def reference_from_database(filename: Union[str, pathlib.Path], query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None) -> Package
-
Define a package referring to a database-held dataset.
Args
filename : Union[str, Path] - Filename of the package archive to create.
query : str - The SQL query which will be used to collect the dataset.
connection : str - The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: https://docs.sqlalchemy.org/core/connections.html
options : Optional[dict] - Dictionary of database connection options.
credentials_info : Optional[dict] - Dictionary of credentials information if not provided in the connection string.
Returns
Package - The archive object.
def reference_from_mongo_database(filename: Union[str, pathlib.Path], query: str, connection: str, database: str, collection: str, projection: dict = {}, limit: Optional[int] = None, sort: Optional[List] = None) -> Package
-
Define a package referring to a database-held dataset.
Args
filename : Union[str, Path] - Filename of the package archive to create.
query : str - JSON dictionary which is compatible with pymongo.
connection : str - Mongo connection URI. See: https://docs.mongodb.com/manual/reference/connection-string/
Returns
Package - The archive object.
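For example (the connection URI and query are placeholders):

    from preprocessor.package import Package

    pkg = Package.reference_from_mongo_database(
        "mongo_data.zip",
        query='{"year": 2020}',
        connection="mongodb://user:password@dbhost:27017",
        database="warehouse",
        collection="sales",
        limit=10000,
    )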
Methods
def create_sqlalchemy_engine(self) -> sqlalchemy.engine.base.Engine
-
Create a SQLAlchemy engine for the package's database
Returns
sqlalchemy.engine.Engine - The engine.
def get_data_transforms(self)
def get_model_misclassifications(self) -> List[pandas.core.frame.DataFrame]
-
Get information about failed test cases during model training
If the final model failed to produce correct results for any of the labeled test data, a sample of "unrestricted" information about those failures is returned. The unrestricted data is declared by the dataset owner when they mark a data column as Unmasked.
A maximum of 10 records is returned per client.
Returns
List[pd.DataFrame] - DataFrames holding the unmasked data for failures, up to one per client.
def get_package_type(self) -> PackageType
-
Get the category of the packaged data
Returns
PackageType - An indication of the content of the package.
def get_target_mapping(self)
def get_target_transforms(self)
def get_vocab(self)
def get_word_embedding(self)
def hash_contents(self)
-
Hashes the contents of the Package
For all Package types, hash the concatenated CRC-32 values of files in the package, excluding spec files. For database variant Packages, the database query is stored in metadata and also gets hashed.
Note: The hash is not stored in the Package.
Returns
string - Hex digest of the SHA-256 hash.
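A usage sketch (the file name is hypothetical):

    from preprocessor.package import Package

    pkg = Package.load("training_data.zip")
    digest = pkg.hash_contents()
    print(digest)  # 64-character hex digest of the SHA-256 hash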
def iter_records(self) -> PackageIterator
def model(self)
-
Extract model contained in package to memory
Possible model_types include:
* keras
* pytorch
* sklearn
* recommender
* onnx: A ModelProto object
* xgboost: xgboost.XGBClassifier or xgboost.XGBRegressor
* pmml_regression: A privophy.RegressionModel or privophy.GeneralRegressionModel object
* pmml_tree
* network_builder: JSON describing the TripleBlind model (e.g. split NN, vertical network)
Returns
Or[Pytorch, Keras, SKlearn, XGBoost, Recommender models, PMMLRegression, JSON] - The model.
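A usage sketch, assuming "model.zip" is a hypothetical model package (e.g. one created by from_model()):

    from preprocessor.package import Package

    pkg = Package.load("model.zip")
    model = pkg.model()  # e.g. a torch.nn.Module for a pytorch package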
def model_pointer(self) -> Tuple[str, object]
-
Return the model type and a file pointer directly to the model file path inside the zip file. model_types include: keras, pytorch, sklearn, recommender, and xgboost.
Returns
Tuple[model_type as string, zip file pointer to model path]
def perform_database_report(self, report_values) -> pandas.core.frame.DataFrame
def populate_spec(self, force: bool = False)
def record_data(self) -> pandas.core.frame.DataFrame
def record_data_as_file(self)
def records(self) -> pandas.core.frame.DataFrame
def records_chunked(self, chunksize: int) -> Iterator[pandas.core.frame.DataFrame]
def regenerate_spec(self)
def store_synthesizer(self, synthesizer, synth_type: str, synth_version: Optional[str] = None)
-
Stores the trained synthesizer for the package
Args
synthesizer - The synthesizer model to store in the package.
synth_type : str - The type of the synthesizer model to store. Supported synthesizer types include:
* CTGAN
* LEGACY
synth_version : str, optional - The version of the synthesizer model to store.
def synthesizer(self)
-
Returns the data synthesizer model in memory.
Supported synthesizer types include:
* CTGAN
class PackageIterator (parent: Package, zip: zipfile.ZipFile, df: pandas.core.frame.DataFrame)
-
Helper for walking through the contents of a Package file
Ancestors
- collections.abc.Iterator
- collections.abc.Iterable
- typing.Generic
class PackageType (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Ancestors
- enum.Enum
Class variables
var AWS_S3_BUCKET_STORAGE
var AZURE_DATA_LAKE_STORAGE
var CSV
var DATABASE_REPORT
var JSON
var MODEL
var MONGO
var SQL
var WORD_EMBEDDING