Module preprocessor.package
A Package is a single file used to hold one or more files of data. The
Package is essentially a .zip archive containing several specific files that
define metadata about the package. Each package contains a file named
".meta.json" as well as the data itself.
Functions
def connect_sqlalchemy(engine, max_retries=4)
-
Connect to a SQLAlchemy engine with retries and exponential backoff
Args
engine
:sa.engine.Engine
- The engine to connect to
max_retries
:int
- The maximum number of times to retry the connection
Returns
sa.engine.Connection
- The connection
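A minimal usage sketch (the connection URL is a hypothetical placeholder):

    import sqlalchemy as sa
    from preprocessor.package import connect_sqlalchemy

    # Any SQLAlchemy-compatible URL works here.
    engine = sa.create_engine("postgresql://user:pass@db.example.com/clinic")

    # Retries the initial connection up to max_retries times with
    # exponential backoff, then returns a live Connection.
    conn = connect_sqlalchemy(engine, max_retries=4)
    try:
        conn.execute(sa.text("SELECT 1"))
    finally:
        conn.close()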
def generate_mustache_mock_context(template, params: List[ReportParameter])
-
For validating a user-provided sql/elasticsearch template, use the param definitions to generate a mock mustache context. This can be used with chevron to attempt to populate the template and warn the user of any errors (this happens outside this function).
def validate_report_template_params(query_template: str, params: Union[List[ReportParameter], Dict[str, ReportParameter]], verbose=False)
-
Validate a user-provided sql/elasticsearch template, using the param definitions to generate a mock mustache context. Attempt to populate it with chevron.
Returns
query_template rendered with mock context
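A sketch of the underlying mechanism, using chevron directly; the template and mock context shown are illustrative, and the real context is derived from the ReportParameter definitions:

    import chevron

    template = "SELECT * FROM visits WHERE state = '{{state}}' LIMIT {{limit}}"

    # generate_mustache_mock_context builds a context shaped like this.
    mock_context = {"state": "MOCK", "limit": "1"}

    # A clean render means every template tag has a matching parameter.
    print(chevron.render(template, mock_context))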
Classes
class Meta (version: MetaVersion, last_modified: datetime.datetime, record: Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport, MetaRecordReportResult, MetaRecordModelFolder, MetaRecordElasticsearch, MetaRecordElasticsearchReport, MetaRecordGenericResult])
Ancestors
- IsDict
- abc.ABC
Class variables
var SCHEMA_V1
var SCHEMA_V2
var SCHEMA_VERSION
var last_modified : datetime.datetime
var record : Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport, MetaRecordReportResult, MetaRecordModelFolder, MetaRecordElasticsearch, MetaRecordElasticsearchReport, MetaRecordGenericResult]
var version : MetaVersion
Static methods
def from_record(valid: dict) -> Union[MetaRecordCsv, MetaRecordSql, MetaRecordMongo, MetaRecordModel, MetaRecordJson, MetaRecordWordEmbedding, MetaRecordAWSs3BucketStorage, MetaRecordAzureDataLakeStorage, MetaRecordDatabaseReport, MetaRecordReportResult, MetaRecordModelFolder, MetaRecordElasticsearch, MetaRecordElasticsearchReport, MetaRecordGenericResult]
def promote_v2(valid: dict) -> dict
def validate(input: dict) -> dict
Inherited members
class MetaRecordAWSs3BucketStorage (bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str)
Class variables
var aws_access_key_id : str
var aws_secret_access_key : str
var bucket_name : str
var object_name : str
var region : str
Static methods
def from_dict(input: dict) -> MetaRecordAWSs3BucketStorage
Methods
def to_dict(self) -> dict
class MetaRecordAzureDataLakeStorage (storage_account_name: str, storage_key: str, file_system: str, path: str)
Class variables
var file_system : str
var path : str
var storage_account_name : str
var storage_key : str
Static methods
def from_dict(input: dict) -> MetaRecordAzureDataLakeStorage
Methods
def get_file_client(self) -> azure.storage.filedatalake._data_lake_file_client.DataLakeFileClient
def to_dict(self) -> dict
class MetaRecordCsv (sheet_path: str, path_column: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)
Class variables
var path_column : str
var sheet_path : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordCsv
Methods
def to_dict(self) -> dict
class MetaRecordDatabaseReport (query_template: str, params: Dict[str, ReportParameter], connection: Optional[str] = None, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None, federation_group: Optional[str] = None, aggregation_template: Optional[dict] = None, post_processing_script: Optional[str] = None, json_to_dataframe_script: Optional[str] = None, name: Optional[str] = None, description: Optional[str] = None)
Class variables
var aggregation_template : Optional[dict]
var connection : Optional[str]
var connection_opts : Optional[dict]
var credentials_info : Optional[dict]
var description : Optional[str]
var federation_group : Optional[str]
var json_to_dataframe_script : Optional[str]
var name : Optional[str]
var params : Dict[str, ReportParameter]
var post_processing_script : Optional[str]
var query_template : str
Static methods
def from_dict(input: dict) -> MetaRecordDatabaseReport
Methods
def to_dict(self) -> dict
class MetaRecordElasticsearch (body: dict, connection: str, index: str, api_key: str, return_type: str = 'agg_json', store_type: Optional[str] = None)
Class variables
var api_key : str
var body : dict
var connection : str
var index : str
var return_type : str
var store_type : Optional[str]
Static methods
def from_dict(in_dict: dict) -> MetaRecordElasticsearch
Methods
def to_dict(self) -> dict
class MetaRecordElasticsearchReport (name: str, body_template: str, params: Dict[str, ReportParameter], connection: str, index: str, api_key: str, description: Optional[str] = None, post_processing_script: Optional[str] = (None,))
Class variables
var api_key : str
var body_template : str
var connection : str
var description : Optional[str]
var index : str
var name : str
var params : Dict[str, ReportParameter]
var post_processing_script : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordElasticsearchReport
Methods
def to_dict(self) -> dict
class MetaRecordGenericResult (files: Union[List[str], List[pathlib.Path]], job_id: str, job_type: str)
Class variables
var files : Union[List[str], List[pathlib.Path]]
var job_id : str
var job_type : str
Static methods
def from_dict(in_dict: dict) -> MetaRecordGenericResult
Methods
def to_dict(self) -> dict
class MetaRecordJson (data: str)
Class variables
var data : str
Static methods
def from_dict(input: dict) -> MetaRecordJson
Methods
def to_dict(self) -> dict
class MetaRecordModel (model_path: str, model_type: str, unrestricted_data: list, reports: ModelReportRecord, logs: list, data_transformers_path: List = None, target_transformers_path: List = None, vocab_path: str = None, target_map_path: str = None)
Class variables
var data_transformers_path : List
var logs : list
var model_path : str
var model_type : str
var reports : ModelReportRecord
var target_map_path : str
var target_transformers_path : List
var unrestricted_data : list
var vocab_path : str
Static methods
def from_dict(input: dict) -> MetaRecordModel
Methods
def to_dict(self) -> dict
class MetaRecordModelFolder (manifest: dict = None)
Class variables
var manifest : dict
Methods
def from_dict(input: dict) -> MetaRecordModelFolder
def to_dict(self) -> dict
class MetaRecordMongo (query: str, connection: str, database: str, collection: str, projection: str, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None, limit: Optional[int] = None, sort: Optional[List] = None)
Class variables
var collection : str
var connection : str
var database : str
var limit : Optional[int]
var projection : str
var query : str
var sort : Optional[List]
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordMongo
Methods
def to_dict(self) -> dict
class MetaRecordReportResult (manifest: dict = None)
Class variables
var manifest : dict
Methods
def from_dict(input: dict) -> MetaRecordReportResult
def to_dict(self) -> dict
class MetaRecordSql (query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None, synthesizer_path: Optional[str] = None, synthesizer_type: Optional[str] = None)
Class variables
var connection : str
var credentials_info : Optional[dict]
var options : Optional[dict]
var query : str
var synthesizer_path : Optional[str]
var synthesizer_type : Optional[str]
Static methods
def from_dict(input: dict) -> MetaRecordSql
Methods
def to_dict(self) -> dict
class MetaRecordWordEmbedding (vocab_path: str, embedding_path: str)
Class variables
var embedding_path : str
var vocab_path : str
Static methods
def from_dict(input: dict) -> MetaRecordWordEmbedding
Methods
def to_dict(self) -> dict
class MetaVersion (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Ancestors
- enum.Enum
Class variables
var V1
var V2
class ModelReportRecord (input_shape: List[Union[List[int], int]] = None, output_shape: List[int] = None, model_summary: List[str] = None, library_version: str = None)
Class variables
var input_shape : List[Union[List[int], int]]
var library_version : str
var model_summary : List[str]
var output_shape : List[int]
Static methods
def from_dict(kvs: Union[dict, list, str, int, float, bool, None], *, infer_missing=False) -> ~A
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) -> ~A
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) -> dataclasses_json.mm.SchemaF[~A]
Methods
def to_dict(self, encode_json=False) -> Dict[str, Union[dict, list, str, int, float, bool, None]]
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, None] = None, separators: Tuple[str, str] = None, default: Callable = None, sort_keys: bool = False, **kw) -> str
class Package (path: pathlib.Path, meta: Meta, spec: Optional[Spec])
-
A collection of data for training or computations, along with descriptions of the contents. A Package is essentially an archive (.zip) of files following a special internal structure:
An example Package with simple tabular data would internally look like:

    filename.zip
        .meta.json             # describes version, creation date, etc.
        some_kind_of_data.csv  # the data in this package

Image Package files also contain an internal "records.csv" which associates information such as training labels with images within the package. An example Image Package file would internally look like:

    filename.zip
        .meta.json   # describes version, creation date, etc.
        records.csv  # index of images and labels (for training)
        images/
            img_001.jpg
            img_002.jpg
Packages can also contain info to authenticate and query a database, etc.
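Because a Package is an ordinary zip archive, it can be inspected with standard tooling (file names illustrative):

    import zipfile

    with zipfile.ZipFile("filename.zip") as z:
        print(z.namelist())  # ['.meta.json', 'some_kind_of_data.csv']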
Class variables
var MANIFEST_FILE
var META_FILE
var SPEC_FILE
Static methods
def create(filename: Union[str, pathlib.Path], record_data: Union[str, pathlib.Path], root: Union[str, pathlib.Path] = None, path_column: Optional[str] = None, label_column: Optional[str] = None, header: Optional[List[str]] = None, spec_override: List[FieldOverride] = [], is_masked: bool = True, unmask_columns: Optional[List[str]] = None, supplemental_named_paths: Optional[Dict[str, str]] = None) -> Package
-
Create a Package using a simple CSV or a CSV describing a folder layout.
For the simple case, just define the record_data (the CSV file) and optionally the header list if the first row of the CSV does not hold the names of the columns.
For the more complex case of a folder layout, the CSV is a list of files and must contain several specific columns:
* path_column (required): Name of the column holding the associated data file paths/filenames.
* root (optional): The root from which the above paths are relative. If None, paths are relative to the CSV itself.
* label_column (optional): Name of the column holding a label describing each file. If there are multiple labels per file, use JSON format to specify the list.
Args
filename
:Union[str, Path]
- Filename of the Package to create
record_data
:Union[str, Path]
- Filename of data used to populate this Package.
root
:Union[str, Path], optional
- Path to the root of the data folder. Default is None.
path_column
:str, optional
- Name of the column in the record_data file which contains paths to data files. If None, the record_data is treated as a simple tabular data file.
label_column
:str, optional
- Name of the label column. When a path_column exists, this column holds labels associated with the file in the path_column. Multi-label datasets need to be in JSON format.
header
:List[str], optional
- A list of column names. If None, the first row of the CSV will be used as a header.
spec_override
:List[FieldOverride], optional
- Field overrides applied to the generated spec. Default is [].
is_masked
:bool
- Whether or not the data is masked.
unmask_columns
:List[str], optional
- List of individual fields to unmask.
supplemental_named_paths
:Dict[str, str], optional
- A dictionary of name:path entries indicating additional files to be included in the package.
Returns
Package
- The archive object
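Two usage sketches (file and column names are hypothetical):

    from preprocessor.package import Package

    # Simple tabular case: package a single CSV whose first row is the header.
    pkg = Package.create("table.zip", "records.csv")

    # Folder-layout case: each CSV row points at an image file and its label.
    pkg = Package.create(
        "images.zip",
        "index.csv",
        root="images/",
        path_column="filepath",
        label_column="label",
    )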
def create_database_report(filename: Union[str, pathlib.Path], query_template: str, params: Dict[str, ReportParameter], connection: Optional[str] = None, connection_opts: Optional[dict] = None, credentials_info: Optional[dict] = None, federation_group: Union[str, uuid.UUID, None] = None, aggregation_template: Optional[dict] = None, post_processing_script: Optional[str] = None, json_to_dataframe_script: Optional[str] = None, name: Optional[str] = None, description: Optional[str] = None) -> Package
-
Validate and create a database report Package.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
query_template
:str
- The SQL/Elastic query template containing Mustache template parameters for the report.
params
:Dict[str, ReportParameter]
- Parameters for this report.
connection
:str
- The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: [https://docs.sqlalchemy.org/core/connections.html]
connection_opts
:dict, optional
- Dictionary of database connection options.
credentials_info
:dict, optional
- Dictionary of credentials information if not provided in the connection string.
federation_group
:Union[str, UUID, None], optional
- The federation group to use when running this report. If present, overrides connection.
aggregation_template
:dict, optional
- The result aggregation to use when running a federated report.
post_processing_script
:str, optional
- A Python function or the filename containing a function to run after the report has been executed. The function must have the signature: def postprocess(df: pd.DataFrame, ctx: dict). The two arguments are the report output data frame and a dict holding the user-selected report parameters as context.
json_to_dataframe_script
:str, optional
- A Python function or the filename containing a function to convert the JSON output of individual federation members to a pandas dataframe.
name
:str, optional
- The name of the report.
description
:str, optional
- The description of the report.
Raises
Exception
- Query template must not be blank.
Exception
- Report parameters are required.
Exception
- Invalid param_type for ReportParameter …
Exception
- PARAM does not appear in the query template.
Exception
- PARAM missing from
params
Exception
- Name must be unique for each parameter, PARAM reused.
Returns
Package
- The created report package object
def create_elastic_search_query(filename: Union[str, pathlib.Path], connection: str, api_key: str, index: str, body: dict, return_type: str, store_type: str)
def create_elastic_search_report(filename: Union[str, pathlib.Path], name: str, description: str, post_processing_script: str, body_template: str, params: dict, connection: str, index: str, api_key: str) -> Package
def create_from_database(filename: Union[str, pathlib.Path], query: str, connection: str) -> Package
-
Define a package extracted from a database-held dataset.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
query
:str
- The SQL query used to collect the dataset.
connection
:str
- The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: [https://docs.sqlalchemy.org/core/connections.html]
Returns
Package
- The archive object
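A usage sketch (the query and DSN are hypothetical):

    from preprocessor.package import Package

    pkg = Package.create_from_database(
        "extract.zip",
        query="SELECT id, age, outcome FROM patients",
        connection="postgresql://user:pass@db.example.com/clinic",
    )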
def from_aws_s3_bucket_storage(filename: Union[str, pathlib.Path], bucket_name: str, region: str, object_name: str, aws_access_key_id: str, aws_secret_access_key: str) -> Package
-
Create a package file referencing an AWS S3 Bucket Storage data file
Args
filename
:Union[str, Path]
- Filename of the package to be created.
bucket_name
:str
- Name of the AWS S3 Bucket containing the data file
region
:str
- The AWS region
object_name
:str
- The file name, known as the object name or key in AWS S3
aws_access_key_id
:str
- Access key for this account, region, bucket
aws_secret_access_key
:str
- Secret access key for this account, region, bucket
Returns
Package
- The created Package object
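A usage sketch (all values are hypothetical placeholders):

    from preprocessor.package import Package

    pkg = Package.from_aws_s3_bucket_storage(
        "s3_ref.zip",
        bucket_name="my-data-bucket",
        region="us-east-1",
        object_name="datasets/train.csv",
        aws_access_key_id="AKIA...",
        aws_secret_access_key="...",
    )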
def from_azure_blob_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, key: str) -> Package
-
Create a package file referencing an Azure Blob Storage data file
Args
filename
:Union[str, Path]
- Filename of the package to be created.
storage_account_name
:str
- The Azure storage account to reference.
storage_key
:str
- Access token used when pulling files from the storage account.
file_system
:str
- File system defined in the Azure control panel for the storage account.
key
:str
- The full path to the file that will be downloaded.
Returns
Package
- The created Package object
def from_azure_data_lake_storage(filename: Union[str, pathlib.Path], storage_account_name: str, storage_key: str, file_system: str, path: str) -> Package
-
Create a package file referencing an Azure Data Lake Storage data file
Args
filename
:Union[str, Path]
- Filename of the package to be created.
storage_account_name
:str
- The Azure storage account to reference.
storage_key
:str
- Access token used when pulling files from the storage account.
file_system
:str
- File system defined in the Azure control panel for the storage account.
path
:str
- The full path to the file that will be downloaded.
Returns
Package
- The created Package object
def from_generic_result(package_name: Union[str, pathlib.Path], file_data_dict: dict, job_id: str, job_type: str) -> Package
def from_image_dataset_folder(output_zip: Union[str, pathlib.Path], path: Union[str, pathlib.Path]) -> Package
-
Create a package from a torch-style image dataset folder structure.
NOTE: labels must be numeric values
Assumes structure:

    path/
        <label>/
            imgs
        <label>/
            imgs

Args
output_zip
:Union[str, Path]
- Path of output zipfile
path
:Union[str,Path]
- Path to folder structure
Returns
Package
- Package file holding the given input data.
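A usage sketch, assuming a folder with numeric label directories:

    from preprocessor.package import Package

    # dataset/
    #     0/ img_001.jpg ...
    #     1/ img_101.jpg ...
    pkg = Package.from_image_dataset_folder("images.zip", "dataset/")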
def from_json(filename: Union[str, pathlib.Path], data: Or(str, pathlib.Path)) -> Package
-
Create package from JSON file.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
data
:str
- Filename of the data to package
Returns
Package
- The created Package object
def from_model(filename: Union[str, pathlib.Path], model_type: str, model_path: Or(str, pathlib.Path), unrestricted_data: List[str] = [], reports: Optional[ModelReportRecord] = None, logs: List[str] = [], data_transformers_path: Union[str, pathlib.Path] = '', target_transformers_path: Union[str, pathlib.Path] = '', vocab_path: Union[str, pathlib.Path] = '', target_map_path: Union[str, pathlib.Path] = '', validation_hash: Optional[str] = None) -> Package
-
Create package from model file.
Args
filename
:Union[str, Path]
- Filename of the package to create
model_type
:str
- Model format, e.g. "torch", "keras", etc.
model_path
:str
- Current location of model to archive
Returns
Package
- The created Package object
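A usage sketch (file names are hypothetical):

    from preprocessor.package import Package

    pkg = Package.from_model("model.zip", model_type="torch", model_path="model.pt")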
def from_model_folder(filename: Union[str, pathlib.Path], model_path: Union[str, pathlib.Path], manifest: Dict) -> Package
-
Create package from model directory.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
model_path
:Union[str, Path]
- Path to the model directory
manifest
:Dict
- The manifest of the model
Returns
Package
- The created Package object
def from_numpy(output_zip: Union[str, pathlib.Path], X: Union[numpy.ndarray, str, pathlib.Path], y: Union[List, numpy.ndarray, str, pathlib.Path] = None) -> Package
-
Prepare a single data file from numpy as an appropriately structured Package
Args
output_zip
:Union[str, Path]
- Path of Package to create
X
:np.array
- Training data
y
:np.array
- Training labels
Returns
Package
- Package file holding the given input data.
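A usage sketch with synthetic arrays:

    import numpy as np
    from preprocessor.package import Package

    X = np.random.rand(100, 8)         # training data
    y = np.random.randint(0, 2, 100)   # training labels
    pkg = Package.from_numpy("train.zip", X, y)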
def from_report_result(filename: Union[str, pathlib.Path], result: pandas.core.frame.DataFrame, manifest: Dict) -> Package
-
Create a package file referencing a report result
Args
filename
:Union[str, Path]
- Filename of the package to be created.
result
:pd.DataFrame
- The result of the report to be packaged.
manifest
:Dict
- The manifest of the report result
Returns
Package
- The created Package object
def from_single_file(output: Union[str, pathlib.Path], input: Union[str, pathlib.Path], is_masked: bool = True, unmask_columns: Optional[List[str]] = None) -> Tuple[Package, bool]
-
Prepare a single data file as an appropriately structured Package
Args
output
:Union[str, Path]
- Path of Package to create
input
:Union[str, Path]
- Path of data to be placed into the Package
is_masked
:bool
- Whether or not the data is masked.
unmask_columns
:List[str], optional
- List of column names that are unmasked. Defaults to masking all columns.
Raises
Exception
- Unable to ascertain the proper Package to hold the data
Returns
(Package, bool)
- Package file holding the given input data, plus a boolean indicating whether it is a package of images.
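A usage sketch (the unmasked column name is hypothetical):

    from preprocessor.package import Package

    pkg, is_images = Package.from_single_file(
        "out.zip", "data.csv", unmask_columns=["zip_code"]
    )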
def from_word_embedding(filename: Union[str, pathlib.Path], embedding_path: Union[str, pathlib.Path], vocab_path: Union[str, pathlib.Path] = '') -> Package
-
Create word embedding package for training
Args
filename
:Union[str, Path]
- Filename of the package to be created
embedding_path
:Union[str, Path]
- Path to the source word embedding
vocab_path
:Union[str, Path]
- Path to the source vocabulary for embedding
Returns
Package
- The created Package object
def load(path: Union[str, pathlib.Path], validation_hash: Optional[str] = None) -> Package
-
Instantiates a Package object from a file
Args
path
:Union[str, Path]
- The file, must already be in package format
validation_hash
:str, optional
- The expected hash of the package contents. The router stores the hash at asset registration. Defaults to None, which bypasses the check.
Returns
Package
- A Package instance
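A usage sketch:

    from preprocessor.package import Package

    pkg = Package.load("data.zip")
    print(pkg.get_package_type())  # e.g. PackageType.CSV
    df = pkg.records()             # contents as a pandas DataFrame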
def reference_from_database(filename: Union[str, pathlib.Path], query: str, connection: str, options: Optional[dict] = None, credentials_info: Optional[dict] = None) -> Package
-
Define a package referring to a database-held dataset.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
query
:str
- The SQL query which will be used to collect the dataset.
connection
:str
- The SQLAlchemy compliant connection string that defines where the database resides, as well as how to authenticate with it. See: [https://docs.sqlalchemy.org/core/connections.html]
options
:dict, optional
- Dictionary of database connection options.
credentials_info
:dict, optional
- Dictionary of credentials information if not provided in the connection string.
Returns
Package
- The archive object
def reference_from_mongo_database(filename: Union[str, pathlib.Path], query: str, connection: str, database: str, collection: str, projection: dict = {}, limit: Optional[int] = None, sort: Optional[List] = None) -> Package
-
Define a package referring to a database-held dataset.
Args
filename
:Union[str, Path]
- Filename of the package to be created.
query
:str
- JSON dictionary which is compatible with pymongo.
connection
:str
- Mongo connection uri. See: [https://docs.mongodb.com/manual/reference/connection-string/]
Returns
Package
- The archive object
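A usage sketch (connection string and names are hypothetical):

    from preprocessor.package import Package

    pkg = Package.reference_from_mongo_database(
        "mongo_ref.zip",
        query='{"age": {"$gt": 40}}',
        connection="mongodb://user:pass@mongo.example.com:27017",
        database="clinic",
        collection="patients",
    )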
Methods
def create_sqlalchemy_engine(self) -> sqlalchemy.engine.base.Engine
-
Create a SQLAlchemy engine for the package's database
Returns
sqlalchemy.engine.Engine
- The engine
def get_data_transforms(self)
def get_manifest_html(self)
def get_model_misclassifications(self) -> List[pandas.core.frame.DataFrame]
-
Get information about failed test cases during model training
If the final model failed to produce correct results for any of the labeled test data, a sample of "unrestricted" information about those failures is returned. The unrestricted data is declared by the dataset owner when they mark a data column as Unmasked.
A maximum of 10 records are returned per client.
Returns
List[pd.Dataframe]
- Dataframes holding the unmasked data for failures, up to one per client
def get_package_type(self) -> PackageType
-
Get the category of the packaged data
Returns
PackageType
- An indication of the content of the package
def get_target_mapping(self)
def get_target_transforms(self)
def get_vocab(self)
def get_word_embedding(self)
def hash_contents(self)
-
Hashes the contents of the Package
For all Package types, hash the concatenated CRC-32 values of files in the package, excluding spec files. For database variant Packages, the database query is stored in metadata and also gets hashed.
Note: The hash is not stored in the Package.
Returns
string
- hexdigest of sha256 hash
def iter_records(self) -> PackageIterator
def model(self)
-
Extract the model contained in the package into memory.
Possible model_types include:
* keras
* pytorch
* sklearn
* recommender
* onnx: A ModelProto object
* xgboost: xgboost.XGBClassifier or xgboost.XGBRegressor
* pmml_regression: A privophy.RegressionModel or privophy.GeneralRegressionModel object
* pmml_tree
* network_builder: JSON describing the TripleBlind model (e.g. split NN, vertical network)
Returns
Or[Pytorch, Keras, SKlearn, XGBoost, Recommender models, PMMLRegression, JSON]
- The model
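A usage sketch, assuming a model Package created with from_model:

    from preprocessor.package import Package

    pkg = Package.load("model.zip")
    model = pkg.model()  # e.g. a torch.nn.Module when model_type is "pytorch"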
def model_pointer(self) -> Tuple[str, object]
-
Return the model type and a file pointer directly to the model file path inside the zip file. model_types include: keras, pytorch, sklearn, recommender, and xgboost.
Returns
Tuple[model_type as string, zip file pointer to model path]
def perform_database_report(self, report_values) -> pandas.core.frame.DataFrame
def perform_elastic_search_report(self, report_values)
def populate_report_template(self, report_values) -> str
def populate_spec(self, force: bool = False)
def record_data(self) -> pandas.core.frame.DataFrame
def record_data_as_file(self)
def records(self) -> pandas.core.frame.DataFrame
def records_chunked(self, chunksize: int) -> Iterator[pandas.core.frame.DataFrame]
def regenerate_spec(self)
async def substitute_connection_secrets(self, secret_store)
-
Use the Access Point-provided secret store to replace handlebars variables in connection strings.
def substitute_connection_secrets_sync(self, secret_store)
def validate_db_connection(self)
def validate_sql(self)
-
Run an SQL linter on the query to validate syntax.
Raises
ValueError
- Failed mustache rendering.
ValueError
- Failed SQLFluff linter. Content is a list of error strings in the format: ["Line {line_no}, Position {column_no}: {error message}", …]
class PackageIterator (parent: Package, zip: zipfile.ZipFile, df: pandas.core.frame.DataFrame)
-
Helper for walking through the contents of a Package file
Ancestors
- collections.abc.Iterator
- collections.abc.Iterable
- typing.Generic
class PackageType (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Ancestors
- enum.Enum
Class variables
var AWS_S3_BUCKET_STORAGE
var AZURE_DATA_LAKE_STORAGE
var CSV
var DATABASE_REPORT
var ELASTIC_SEARCH
var ELASTIC_SEARCH_REPORT
var GENERIC_RESULT
var JSON
var MODEL
var MODEL_FOLDER
var MONGO
var REPORT_RESULT
var SQL
var WORD_EMBEDDING