diff --git a/doc/data_access.md b/doc/data_access.md new file mode 100644 index 000000000..3e12f67df --- /dev/null +++ b/doc/data_access.md @@ -0,0 +1,122 @@ +# FATE Data Access Guide + +## 1. Upload Process +The process diagram for data upload is as follows: + +![Data Upload](./images/upload_data.png) +- The client uploads data to the server. +- The server encapsulates the upload parameters into a DAG job configuration, including two components: 'upload' and 'dataframe-transformer,' then calls the submit interface to submit the job. +- The 'upload' component stores data into the FATE storage service. +- The 'transformer' component converts the data output from the 'upload' component into a dataframe and stores it into the FATE storage service. +- Metadata about the data is stored in the database. + +## 2. Data Upload Methods +Note: FATE provides clients including SDK, CLI, and Pipeline. If you haven't deployed the FATE Client in your environment, you can use `pip install fate_client` to download it. The following operations are CLI-based. + +### 2.1 Upload Scenario Explanation +- Client-server separation: Installed client and server are on different machines. +- Client-server non-separation: Installed client and server are on the same machine. +Difference: In scenarios where the client and server are not separated, the step "the client uploads data to the server" in the above process can be omitted to improve data upload efficiency in scenarios with large data volumes. There are differences in interfaces and parameters between the two scenarios, and you can choose the corresponding scenario for data upload. + +### 2.2 Data Upload +#### 2.2.1 Configuration and Data Preparation + - Upload configuration is located in [examples-upload](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0/examples/upload) + ```yaml + { + "file": "examples/data/breast_hetero_guest.csv", + "head": true, + "partitions": 16, + "extend_sid": true, + "meta": { + "delimiter": ",", + "label_name": "y", + "match_id_name": "id" + }, + "namespace": "experiment", + "name": "breast_hetero_guest" + } + ``` + - file: File path + - head: Whether the data contains a header: true/false + - partitions: Number of data storage partitions + - extend_sid: Whether to generate an 'sid' column + - meta: Metadata about the data + - namespace && name: Reference to data in the FATE storage table + - Uploaded data is located in [upload-data](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0/examples/data) + - You can also use your own data and modify the "meta" information in the upload configuration. + +#### 2.2.2 Data Upload Commands +##### Client-Server Non-Separation +```shell +flow data upload -c examples/upload/upload_guest.json +``` +Note: Ensure that the file path in the configuration exists on the server. +##### Client-Server Separation +```shell +flow data upload-file -c examples/upload/upload_guest.json +``` +#### 2.2.3 Upload Results +```json +{ + "code": 0, + "data": { + "name": "breast_hetero_guest", + "namespace": "experiment" + }, + "job_id": "202312281606030428210", + "message": "success" +} +``` + +#### 2.2.4 Data Query +Since the entire upload is an asynchronous operation, it's necessary to confirm successful upload before performing subsequent operations. +```shell +flow table query --namespace experiment --name breast_hetero_guest +``` +- Successful data upload returns: +```json +{ + "code": 0, + "data": { + "count": 569, + "data_type": "dataframe", + "engine": "standalone", + "meta": {}, + "name": "breast_hetero_guest", + "namespace": "experiment", + "path": "xxx", + "source": { + "component": "dataframe_transformer", + "output_artifact_key": "dataframe_output", + "output_index": null, + "party_task_id": "202312281606030428210_transformer_0_0_local_0", + "task_id": "202312281606030428210_transformer_0", + "task_name": "transformer_0" + } + }, + "message": "success" +} +``` + +## 3. Data Binding +For specific algorithms that may require particular datasets, FATE Flow provides a data binding interface to make the data available for use in FATE. + +```shell +flow table bind --namespace bind_data --name breast_hetero_guest --path /data/projects/fate/fate_flow/data/xxx +``` + +## 4. Data Query +For uploaded or bound data tables, you can use the query interface to retrieve brief information about the data. + +```shell +flow table query --namespace experiment --name breast_hetero_guest +``` + +## 5. Data Cleaning +You can use delete cli to clean data tables that already exist in FATE. + +```shell +flow table delete --namespace experiment --name breast_hetero_guest +``` + +This covers the translation of the document into English. \ No newline at end of file diff --git a/doc/data_access.zh.md b/doc/data_access.zh.md new file mode 100644 index 000000000..83194d96f --- /dev/null +++ b/doc/data_access.zh.md @@ -0,0 +1,121 @@ +# FATE数据接入指南 +## 1. 上传流程 +数据上传的流程图如下: + +![数据上传](./images/upload_data.png) +- 客户端将数据上传到服务端; +- 服务端将上传参数封装成DAG作业配置, 配置中包含两个组件, 即upload和dataframe-transformer,并调用submit接口提交作业; +- upload组件将数据存储到fate存储服务中; +- transformer组件将upload组件的数据输出转化成dataframe并存储到fate存储服务中; +- 数据的meta信息存储到DB中. + +## 2. 数据上传方式 +注: fate提供的客户端包括SDK、CLI、Pipeline,若你的环境中没有部署FATE Client,可以使用`pip install fate_client`下载,以下的使用操作均基于cli编写。 +### 2.1 上传场景说明 +- 客户端、服务器分离:安装的客户端和服务器不在一台机器 +- 客户端、服务器不分离:安装的客户端和服务器在同一台机器 +两者区别:客户端不分离的场景,可以去掉上述流程中"客户端将数据上传到服务端",以此提高大数据量场景下数据上传的效率。两种场景接口、参数有区别,可以选择对应的场景进行数据上传。 + +### 2.2 数据上传 +#### 2.2.1 配置及数据准备 + - 上传配置位于[examples-upload](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0/examples/upload) + ```yaml + { + "file": "examples/data/breast_hetero_guest.csv", + "head": true, + "partitions": 16, + "extend_sid": true, + "meta": { + "delimiter": ",", + "label_name": "y", + "match_id_name": "id" + }, + "namespace": "experiment", + "name": "breast_hetero_guest" + } + ``` + - file: 文件路径 + - head: 数据是否携带header: true/false + - partitions: 数据存储分区数量 + - extend_sid:是否需要生成sid列 + - meta:数据的元信息 + - namespace && name: 数据在fate的存储表引用 + - 上传数据位于[upload-data](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0/examples/data) + - 你也可以使用自己的数据,并修改upload配置中的"meta"信息 + +#### 2.2.2 上传数据命令 +##### 客户端-服务器不分离 +```shell +flow data upload -c examples/upload/upload_guest.json +``` +注:需要保证配置中的file路径在服务器中存在。 +##### 客户端-服务器分离 +```shell +flow data upload-file -c examples/upload/upload_guest.json +``` +#### 2.2.3 上传结果 +```json +{ + "code": 0, + "data": { + "name": "breast_hetero_guest", + "namespace": "experiment" + }, + "job_id": "202312281606030428210", + "message": "success" +} + +``` + +#### 2.2.4 数据查询 +因为整个上传为异步操作,需要确认是否上传成功才可进行后续操作。 +```shell +flow table query --namespace experiment --name breast_hetero_guest +``` +- 数据上传成功返回 +```json +{ + "code": 0, + "data": { + "count": 569, + "data_type": "dataframe", + "engine": "standalone", + "meta": {}, + "name": "breast_hetero_guest", + "namespace": "experiment", + "path": "xxx", + "source": { + "component": "dataframe_transformer", + "output_artifact_key": "dataframe_output", + "output_index": null, + "party_task_id": "202312281606030428210_transformer_0_0_local_0", + "task_id": "202312281606030428210_transformer_0", + "task_name": "transformer_0" + } + }, + "message": "success" +} + +``` + +## 3. 数据绑定 +对于特定的算法,可能需要特殊的数据集,FATE Flow提供data bind接口来将数据供FATE使用 + +```shell +flow table bind --namespace bind_data --name breast_hetero_guest --path /data/projects/fate/fate_flow/data/xxx +``` + +## 4. 数据查询 +对于上传或者绑定的数据表,可以通过查询接口来获取数据的简略信息 + +```shell +flow table query --namespace experiment --name breast_hetero_guest +``` + +## 5. 数据清理 +可以通过清理接口来清理已经存在FATE的数据表 + +```shell +flow table delete --namespace experiment --name breast_hetero_guest +``` + diff --git a/doc/fate_flow.md b/doc/fate_flow.md index 9360d44db..7559fb673 100644 --- a/doc/fate_flow.md +++ b/doc/fate_flow.md @@ -1,110 +1,80 @@ # Overall Design -## 1. Logical Architecture +## 1. Design Architecture Diagram +![](./images/open_flow.png) +- Application Layer Interface: Used by higher-level components like fate-board, fate-client, etc. +- Interconnect Layer Interface: Divided into Scheduler Interface and Participant Interface. Scheduler Interface receives scheduling commands like create, stop, etc., and sends them to participants. Participant Interface is used by each participant to receive commands like create, run, stop, etc., and execute them. +- Base Interface: Receives status reports from algorithm containers, etc. +- Scheduler: Federated scheduling logic, interprets DSL dependencies, and runs related jobs and tasks. +- Algorithm Container: Environment for algorithm execution. FATE Flow supports running algorithms in local processes or in algorithm containers, with similar execution modes. +- Platform Resource Pool: Abstract computation, communication, storage APIs. -- DSL defined jobs -- Top-down vertical subtask flow scheduling, multi-participant joint subtask coordination -- Independent isolated task execution work processes -- Support for multiple types and versions of components -- Computational abstraction API -- Storage abstraction API -- Cross-party transfer abstraction API -![](./images/fate_flow_logical_arch.png) +## 2. Overall Architecture -## 2. Service Architecture - -### 2.1 FATE +### 2.1 FATE Overall Architecture ![](./images/fate_arch.png) -### 2.2 FATE Flow +### 2.2 FATE Flow Functional Architecture ![](./images/fate_flow_arch.png) -## 3. [Scheduling Architecture](./fate_flow_job_scheduling.md) +### 2.3 FATE Flow Cluster Architecture + +![](./images/flow_cluster.png) -### 3.1 A new scheduling architecture based on shared-state +## 3. Scheduling Architecture +### 3.1 State-Based Scheduling Architecture -- Stripping state (resources, jobs) and managers (schedulers, resource managers) -- Resource state and job state are persisted in MySQL and shared globally to provide reliable transactional operations -- Improve the high availability and scalability of managed services -- Jobs can be intervened to support restart, rerun, parallel control, resource isolation, etc. +- Separation of states (resources, jobs) and managers (scheduler, resource manager) +- Persistent storage of resource and job states in MySQL, globally shared, providing reliable transactional operations +- Improved high availability and scalability of management services +- Intervention in jobs, supporting actions like restarts, reruns, parallel control, resource isolation, etc. ![](./images/fate_flow_scheduling_arch.png) ### 3.2 State-Driven Scheduling -- Resource coordination -- Pull up the child process Executor to run the component -- Executor reports state to local Server and also to scheduler -- Multi-party task state calculation of federal task state -- Upstream and downstream task states compute job states +- North-south state reporting/querying +- East-west multi-party task state computation for federated task states +- Upstream and downstream task state computation for job states +#### 3.2.1 Callback Mode +Scheduler creates jobs and tasks, and each participant actively callbacks the state of jobs or tasks. + +![](./images/schedule_for_callback.png) +#### 3.2.2 Polling Mode +Scheduler not only creates jobs and tasks but also polls the state of jobs or tasks from the participants during the scheduling process. -![](./images/fate_flow_resource_process.png) +![](./images/schedule_for_poll.png) -## 4. [Multiparty Resource Coordination](./fate_flow_resource_management.md) +### 3.4 Algorithm Component Scheduling +- Pre-processing: Handling inputs such as data, models, algorithm parameters +- Component execution: Logic of algorithm components +- Post-processing: Handling outputs of algorithm components -- The total resource size of each engine is configured through the configuration file, and the system is subsequently interfaced -- The cores_per_node in the total resource size indicates the number of cpu cores per compute node, and nodes indicates the number of compute nodes. -- FATEFlow server reads the resource size configuration from the configuration file when it starts and registers the update to the database -- The resources are requested in Job dimension, and take effect when Job Conf is submitted, formula: task_parallelism*task_cores -- See separate section of the documentation for details +![](./images/schedule_for_component.png) -## 5. [Data Flow Tracking](./fate_flow_tracking.md) +## 4. Multi-Party Resource Coordination -- Definition - - metric type: metric type, such as auc, loss, ks, etc. - - metric namespace: custom metric namespace, e.g. train, predict - - metric name: custom metric name, e.g. auc0, hetero_lr_auc0 - - metric data: metric data in key-value form - - metric meta: metric meta information in key-value form, support flexible drawing -- API - - log_metric_data(metric_namespace, metric_name, metrics) - - set_metric_meta(metric_namespace, metric_name, metric_meta) - - get_metric_data(metric_namespace, metric_name) - - get_metric_meta(metric_namespace, metric_name) +- Total resource size for each engine is configured via a configuration file, subsequent system integration to be implemented +- The cores within the total resource size represent the number of CPU cores per computing node +- FATEFlow server reads resource size configuration from the configuration file upon startup and registers updates to the database +- Resources are allocated at the Job level, becoming effective upon Job Conf submission -## 6. [Realtime Monitoring](./fate_flow_monitoring.md) +## 5. Real-time Job Monitoring -- Job process survivability detection +- Work process liveness detection - Job timeout detection - Resource recovery detection -- Base engine session timeout detection +- Basic engine session timeout detection ![](./images/fate_flow_detector.png) -## 7. [Task Component Registry](./fate_flow_component_registry.md) +## 6. [Task Component Center](./provider_register.md) ![](./images/fate_flow_component_registry.png) -## 8. [Multi-Party Federated Model Registry](./fate_flow_model_registry.md) - -- Using Google Protocol Buffer as the model storage protocol, using cross-language sharing, each algorithmic model consists of two parts: ModelParam & ModelMeta -- A Pipeline generates a series of algorithmic models -- The model named Pipeline stores Pipeline modeling DSL and online inference DSL -- Under federal learning, model consistency needs to be guaranteed for all participants, i.e., model binding -- model_key is the model identifier defined by the user when submitting the task -- The model IDs of the federated parties are the party identification information role, party_id, plus model_key -- The model version of the federated parties must be unique and consistent, and FATE-Flow directly sets it to job_id - -![](./images/fate_flow_pipelined_model.png){: style="height:400px;width:450px"} - -![](./images/fate_flow_model_storage.png){: style="height:400px;width:800px"} - -## 9. [Data Access](./fate_flow_data_access.md) - -- Upload. - - External storage is imported directly to FATE Storage, creating a new DTable - - When the job runs, Reader reads directly from Storage - -- Table Bind. - - Key the external storage address to a new DTable in FATE - - When the job is running, Reader reads data from external storage via Meta and transfers it to FATE Storage - - Connecting to the Big Data ecosystem: HDFS, Hive/MySQL - -![](./images/fate_flow_inputoutput.png) - -## 10. [Multi-Party Collaboration Authority Management](./fate_flow_authority_management.md) +## 7. [Data Access](./data_access.md) -![](./images/fate_flow_authorization.png) \ No newline at end of file +![](./images/upload_data.png) diff --git a/doc/fate_flow.zh.md b/doc/fate_flow.zh.md index 0aa3ec000..adc1960b9 100644 --- a/doc/fate_flow.zh.md +++ b/doc/fate_flow.zh.md @@ -1,16 +1,14 @@ # 整体设计 -## 1. 逻辑架构 +## 1. 设计架构图 +![](./images/open_flow.png) +- 应用层接口:供如fate-board、fate-client等上层使用 +- 互联互通层接口:分为调度器接口和参与方接口,调度器接口用于接收如创建、停止等调度命令并下发给参与方,参与方接口用于各参与方接收如创建、运行、停止等命令并执行 +- 底座接口: 用于接收算法容器上报的状态等 +- 调度器:联邦调度逻辑,解析DSL依赖及运行相关的作业及任务 +- 算法容器:是算法运行的环境,FATE Flow支持算法运行在本地进程、算法容器中,其运行方式类似。 +- 平台资源池: 抽象计算、通信、存储API -- DSL定义作业 -- 自顶向下的纵向子任务流调度、多参与方联合子任务协调 -- 独立隔离的任务执行工作进程 -- 支持多类型多版本组件 -- 计算抽象API -- 存储抽象API -- 跨方传输抽象API - -![](./images/fate_flow_logical_arch.png) ## 2. 整体架构 @@ -18,13 +16,16 @@ ![](./images/fate_arch.png) -### 2.2 FATE Flow整体架构 +### 2.2 FATE Flow功能架构 ![](./images/fate_flow_arch.png) -## 3. [调度架构](./fate_flow_job_scheduling.zh.md) +### 2.3 FATE Flow集群架构 + +![](./images/flow_cluster.png) -### 3.1 基于共享状态的全新调度架构 +## 3. 调度架构 +### 3.1 基于共享状态的调度架构 - 剥离状态(资源、作业)与管理器(调度器、资源管理器) - 资源状态与作业状态持久化存于MySQL,全局共享,提供可靠事务性操作 @@ -35,37 +36,34 @@ ### 3.2 状态驱动调度 -- 资源协调 -- 拉起子进程Executor运行组件 -- Executor上报状态到本方Server,并且同时上报到调度方 -- 多方任务状态计算联邦任务状态 +- 南北向状态上报/查询 +- 东西向多方任务状态计算联邦任务状态 - 上下游任务状态计算作业作态 +#### 3.2.1 callback回调模式 +调度器创建作业和任务,由各参与方主动回调作业或任务的状态 + +![](./images/schedule_for_callback.png) +#### 3.2.2 poll轮询模式 +调度器不仅需创建作业和任务,在调度过程中会轮询参与方的作业或任务的状态 + +![](./images/schedule_for_poll.png) -![](./images/fate_flow_resource_process.png) +### 3.4 算法组件调度 +- 前处理: 数据、模型、算法参数等输入处理 +- 组件运行: 算法组件逻辑 +- 后处理: 算法组件输出内容处理 +![](./images/schedule_for_component.png) -## 4. [多方资源协调](./fate_flow_resource_management.zh.md) + +## 4. 多方资源协调 - 每个引擎总资源大小通过配置文件配置,后续实现系统对接 -- 总资源大小中的cores_per_node表示每个计算节点cpu核数,nodes表示计算节点个数 +- 总资源大小中的cores表示每个计算节点cpu核数 - FATEFlow server启动时从配置文件读取资源大小配置,并注册更新到数据库 -- 以Job维度申请资源,Job Conf提交时生效,公式:task_parallelism*task_cores -- 详细请看文档单独章节 - -## 5. [数据流动追踪](./fate_flow_tracking.zh.md) +- 以Job维度申请资源,Job Conf提交时生效 -- 定义 - - metric type: 指标类型,如auc, loss, ks等等 - - metric namespace: 自定义指标命名空间,如train, predict - - metric name: 自定义指标名称,如auc0,hetero_lr_auc0 - - metric data: key-value形式的指标数据 - - metric meta: key-value形式的指标元信息,支持灵活画图 -- API - - log_metric_data(metric_namespace, metric_name, metrics) - - set_metric_meta(metric_namespace, metric_name, metric_meta) - - get_metric_data(metric_namespace, metric_name) - - get_metric_meta(metric_namespace, metric_name) -## 6. [作业实时监测](./fate_flow_monitoring.zh.md) +## 5. 作业实时监测 - 工作进程存活性检测 - 作业超时检测 @@ -74,37 +72,10 @@ ![](./images/fate_flow_detector.png) -## 7. [任务组件中心](./fate_flow_component_registry.zh.md) +## 6. [任务组件中心](./provider_register.zh.md) ![](./images/fate_flow_component_registry.png) -## 8. [多方联合模型注册中心](./fate_flow_model_registry.zh.md) - -- 使用Google Protocol Buffer作为模型存储协议,利用跨语言共享,每个算法模型由两部分组成:ModelParam & ModelMeta -- 一个Pipeline产生一系列算法模型 -- 命名为Pipeline的模型存储Pipeline建模DSL及在线推理DSL -- 联邦学习下,需要保证所有参与方模型一致性,即模型绑定 -- model_key为用户提交任务时定义的模型标识 -- 联邦各方的模型ID由本方标识信息role、party_id,加model_key -- 联邦各方的模型版本必须唯一且保持一致,FATE-Flow直接设置为job_id - -![](./images/fate_flow_pipelined_model.png){: style="height:400px;width:450px"} - -![](./images/fate_flow_model_storage.png){: style="height:400px;width:800px"} - -## 9. [数据接入](./fate_flow_data_access.zh.md) - -- Upload: - - 外部存储直接导入到FATE Storage,创建一个新的DTable - - 作业运行时,Reader直接从Storage读取 - -- Table Bind: - - 外部存储地址关键到FATE一个新的DTable - - 作业运行时,Reader通过Meta从外部存储读取数据并转存到FATE Storage - - 打通大数据生态:HDFS,Hive/MySQL - -![](./images/fate_flow_inputoutput.png) - -## 10. [多方合作权限管理](./fate_flow_authority_management.zh.md) +## 7. [数据接入](./data_access.zh.md) -![](./images/fate_flow_authorization.png) +![](./images/upload_data.png) diff --git a/doc/images/fate_arch.png b/doc/images/fate_arch.png index bd8b2eda6..2ce3bd519 100644 Binary files a/doc/images/fate_arch.png and b/doc/images/fate_arch.png differ diff --git a/doc/images/fate_flow_arch.png b/doc/images/fate_flow_arch.png index 2bb3e3e3d..bb15d1b30 100644 Binary files a/doc/images/fate_flow_arch.png and b/doc/images/fate_flow_arch.png differ diff --git a/doc/images/fate_flow_component_registry.png b/doc/images/fate_flow_component_registry.png index ed2e8f780..a8c5c8543 100644 Binary files a/doc/images/fate_flow_component_registry.png and b/doc/images/fate_flow_component_registry.png differ diff --git a/doc/images/fate_flow_logical_arch.png b/doc/images/fate_flow_logical_arch.png deleted file mode 100644 index 4c7677dac..000000000 Binary files a/doc/images/fate_flow_logical_arch.png and /dev/null differ diff --git a/doc/images/fate_flow_resource_process.png b/doc/images/fate_flow_resource_process.png deleted file mode 100644 index b7275790c..000000000 Binary files a/doc/images/fate_flow_resource_process.png and /dev/null differ diff --git a/doc/images/flow_cluster.png b/doc/images/flow_cluster.png new file mode 100644 index 000000000..d23bb6b0e Binary files /dev/null and b/doc/images/flow_cluster.png differ diff --git a/doc/images/muti_protocol.png b/doc/images/muti_protocol.png new file mode 100644 index 000000000..5ceee5dbd Binary files /dev/null and b/doc/images/muti_protocol.png differ diff --git a/doc/images/open_flow.png b/doc/images/open_flow.png index 538f49546..d6bb88c42 100644 Binary files a/doc/images/open_flow.png and b/doc/images/open_flow.png differ diff --git a/doc/images/open_flow/pipeline_unionpay_lr.png b/doc/images/open_flow/pipeline_unionpay_lr.png new file mode 100644 index 000000000..3787f6fac Binary files /dev/null and b/doc/images/open_flow/pipeline_unionpay_lr.png differ diff --git a/doc/images/open_flow/pipeline_unionpay_sbt.png b/doc/images/open_flow/pipeline_unionpay_sbt.png new file mode 100644 index 000000000..e616e533b Binary files /dev/null and b/doc/images/open_flow/pipeline_unionpay_sbt.png differ diff --git a/doc/images/open_flow/upload_data.png b/doc/images/open_flow/upload_data.png new file mode 100644 index 000000000..1e8a6c0aa Binary files /dev/null and b/doc/images/open_flow/upload_data.png differ diff --git a/doc/images/schedule_for_component.png b/doc/images/schedule_for_component.png index 8f35e3ad4..79e6d4200 100644 Binary files a/doc/images/schedule_for_component.png and b/doc/images/schedule_for_component.png differ diff --git a/doc/images/upload_data.png b/doc/images/upload_data.png new file mode 100644 index 000000000..ddc41d65e Binary files /dev/null and b/doc/images/upload_data.png differ diff --git a/doc/job_scheduling.md b/doc/job_scheduling.md new file mode 100644 index 000000000..dad1cdb73 --- /dev/null +++ b/doc/job_scheduling.md @@ -0,0 +1,495 @@ +# Multi-Party Joint Operation + +## 1. Introduction + +This primarily introduces how to define federated learning jobs using `FATE Flow`. + +## 2. DAG Definition + +FATE 2.0 uses a brand new DAG to define a job, including the upstream and downstream dependencies of each component. + +## 3. Job Functional Configuration + +### 3.1 Prediction +```yaml +dag: + conf: + model_warehouse: + model_id: '202307171452088269870' + model_version: '0' +``` +In `dag.conf.model_warehouse`, define the model information that the prediction task relies on. This model will be used for prediction in the algorithm. + +### 3.2 Job Inheritance +```yaml +dag: + conf: + inheritance: + job_id: "202307041704214920920" + task_list: ["reader_0"] +``` +In `job.conf.inheritance`, fill in the job and algorithm component names that need to be inherited. The newly started job will directly reuse the outputs of these components. + +### 3.3 Specifying the Scheduler Party +```yaml +dag: + conf: + scheduler_party_id: "9999" +``` +In `job.conf.scheduler_party_id`, you can specify scheduler party information. If not specified, the initiator acts as the scheduler. + +### 3.4 Specifying Job Priority +```yaml +dag: + conf: + priority: 2 +``` +In `job.conf.priority`, specify the scheduling weight of the task. The higher the value, the higher the priority. + +### 3.5 Automatic Retry on Failure +```yaml +dag: + conf: + auto_retries: 2 +``` +In `job.conf.auto_retries`, specify the number of retries if a task fails. Default is 0. + +### 3.6 Resource Allocation +```yaml +dag: + conf: + cores: 4 + task: + engine_run: + cores: 2 +``` +- Here, `dag.conf.cores` represents the allocated resources for the entire job (`job_cores`), and `dag.conf.engine_run.cores` represents the allocated resources for the task (`task_cores`). If a job is started with this configuration, its maximum parallelism will be 2. +- Task parallelism = job_cores / task_cores + +### 3.7 Task Timeout +```yaml +dag: + task: + timeout: 3600 # s +``` +In `dag.task.timeout`, specify the task's timeout. When a task is in the 'running' state after reaching the timeout, it triggers an automatic job kill operation. + +### 3.8 Task Provider +```yaml +dag: + task: + provider: fate:2.0.1@local +``` +In `dag.task.provider`, specify the algorithm provider, version number, and execution mode for the task. + +## 4. Input +**Description:** Upstream input, divided into two input types: data and models. + +### 4.1 Data Input +- As parameter input to a component +```yaml +dag: + party_tasks: + guest_9999: + tasks: + reader_0: + parameters: + name: breast_hetero_guest + namespace: experiment + host_9998: + tasks: + reader_0: + parameters: + name: breast_hetero_host + namespace: experiment +``` +The `reader` component supports directly passing a FATE data table as job-level data input. + +- Input of one component from another component's output +```yaml +dag: + tasks: + binning_0: + component_ref: hetero_feature_binning + inputs: + data: + train_data: + task_output_artifact: + output_artifact_key: train_output_data + producer_task: scale_0 +``` +`binning_0` depends on the output data of `scale_0`. + +### 4.2 Model Input +- Model Warehouse +```yaml +dag: + conf: + model_warehouse: + model_id: '202307171452088269870' + model_version: '0' + tasks: + selection_0: + component_ref: hetero_feature_selection + dependent_tasks: + - scale_0 + model: + input_model: + model_warehouse: + output_artifact_key: train_output_model + producer_task: selection_0 +``` + +## 5. Output +The job's output includes data, models, and metrics. + +### 5.1 Metric Output +#### Querying Metrics +Querying output metrics command: +```shell +flow output query-metric -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-metric -j 202308211911505128750 -r arbiter -p 9998 -tn lr_0` +- Input content as follows: +```json +{ + "code": 0, + "data": [ + { + "data": [ + { + "metric": [ + 0.0 + ], + "step": 0, + "timestamp": 1692616428.253495 + } + ], + "groups": [ + { + "index": null, + "name": "default" + }, + { + "index": null, + "name": "train" + } + ], + "name": "lr_loss", + "step_axis": "iterations", + "type": "loss" + }, + { + "data": [ + { + "metric": [ + -0.07785049080848694 + ], + "step": 1, + "timestamp": 1692616432.9727712 + } + ], + "groups": [ + { + "index": null, + "name": "default" + }, + { + "index": null, + "name": "train" + } + ], + "name": "lr_loss", + "step_axis": "iterations", + "type": "loss" + } + ], + "message": "success" +} + +``` + + +### 5.2 Model Output +#### Querying Models +```shell +flow output query-model -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-model -j 202308211911505128750 -r host -p 9998 -tn lr_0` +- Query result as follows: +```json +{ + "code": 0, + "data": { + "output_model": { + "data": { + "estimator": { + "end_epoch": 10, + "is_converged": false, + "lr_scheduler": { + "lr_params": { + "start_factor": 0.7, + "total_iters": 100 + }, + "lr_scheduler": { + "_get_lr_called_within_step": false, + "_last_lr": [ + 0.07269999999999996 + ], + "_step_count": 10, + "base_lrs": [ + 0.1 + ], + "end_factor": 1.0, + "last_epoch": 9, + "start_factor": 0.7, + "total_iters": 100, + "verbose": false + }, + "method": "linear" + }, + "optimizer": { + "alpha": 0.001, + "l1_penalty": false, + "l2_penalty": true, + "method": "sgd", + "model_parameter": [ + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ] + ], + "model_parameter_dtype": "float32", + "optim_param": { + "lr": 0.1 + }, + "optimizer": { + "param_groups": [ + { + "dampening": 0, + "differentiable": false, + "foreach": null, + "initial_lr": 0.1, + "lr": 0.07269999999999996, + "maximize": false, + "momentum": 0, + "nesterov": false, + "params": [ + 0 + ], + "weight_decay": 0 + } + ], + "state": {} + } + }, + "param": { + "coef_": [ + [ + -0.10828543454408646 + ], + [ + -0.07341302931308746 + ], + [ + -0.10850320011377335 + ], + [ + -0.10066638141870499 + ], + [ + -0.04595951363444328 + ], + [ + -0.07001449167728424 + ], + [ + -0.08949052542448044 + ], + [ + -0.10958756506443024 + ], + [ + -0.04012322425842285 + ], + [ + 0.02270071767270565 + ], + [ + -0.07198350876569748 + ], + [ + 0.00548586156219244 + ], + [ + -0.06599288433790207 + ], + [ + -0.06410090625286102 + ], + [ + 0.016374297440052032 + ], + [ + -0.01607361063361168 + ], + [ + -0.011447405442595482 + ], + [ + -0.04352564364671707 + ], + [ + 0.013161249458789825 + ], + [ + 0.013506329618394375 + ] + ], + "dtype": "float32", + "intercept_": null + } + } + }, + "meta": { + "batch_size": null, + "epochs": 10, + "init_param": { + "fill_val": 0.0, + "fit_intercept": false, + "method": "zeros", + "random_state": null + }, + "label_count": false, + "learning_rate_param": { + "method": "linear", + "scheduler_params": { + "start_factor": 0.7, + "total_iters": 100 + } + }, + "optimizer_param": { + "alpha": 0.001, + "method": "sgd", + "optimizer_params": { + "lr": 0.1 + }, + "penalty": "l2" + }, + "ovr": false + } + } + }, + "message": "success" +} + +``` + +#### Downloading Models +```shell +flow output download-model -j $job_id -r $role -p $party_id -tn $task_name -o $download_dir +``` +- `flow output download-model -j 202308211911505128750 -r host -p 9998 -tn lr_0 -o ./` +- Download result: +```json +{ + "code": 0, + "directory": "./output_model_202308211911505128750_host_9998_lr_0", + "message": "Download success, please check the path: ./output_model_202308211911505128750_host_9998_lr_0" +} +``` + +### 5.3 Output Data +#### Querying Data Tables +```shell +flow output query-data-table -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-data-table -j 202308211911505128750 -r host -p 9998 -tn binning_0` +- Query result: +```json +{ + "train_output_data": [ + { + "name": "9e28049c401311ee85c716b977118319", + "namespace": "202308211911505128750_binning_0" + } + ] +} +``` + +#### Previewing Data +```shell +flow output display-data -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output display-data -j 202308211911505128750 -r host -p 9998 -tn binning_0` + +#### Downloading Data +```shell +flow output download-data -j $job_id -r $role -p $party_id -tn $task_name -o $download_dir +``` +- `flow output download-data -j 202308211911505128750 -r guest -p 9999 -tn lr_0 -o ./` +- Result: +```json +{ + "code": 0, + "directory": "./output_data_202308211911505128750_guest_9999_lr_0", + "message": "Download success, please check the path: ./output_data_202308211911505128750_guest_9999_lr_0" +} +``` + diff --git a/doc/job_scheduling.zh.md b/doc/job_scheduling.zh.md new file mode 100644 index 000000000..14f96708b --- /dev/null +++ b/doc/job_scheduling.zh.md @@ -0,0 +1,498 @@ +# 多方联合作业 + +## 1. 说明 + +主要介绍如何使用`FATE Flow`联邦学习作业的定义 + +## 2. DAG定义 +FATE 2.0采用全新DAG定义一个作业,包含各个组件的上下依赖关系 + +## 3. 作业功能配置 +### 3.1 预测 +```yaml +dag: + conf: + model_warehouse: + model_id: '202307171452088269870' + model_version: '0' +``` +在dag.conf.model_warehouse中定义预测任务依赖的模型信息,在算法中将使用此模型进行预测 + +### 3.2 job继承 +```yaml +dag: + conf: + inheritance: + job_id: "202307041704214920920" + task_list: ["reader_0"] +``` +在job.conf.inheritance中填入需要继承的job和算法组件名,新启动的job将直接复用这些组件的输出 + +### 3.3 指定调度方 +```yaml +dag: + conf: + scheduler_party_id: "9999" +``` +在job.conf.scheduler_party_id中可指定调度方信息,若不指定则由发起方充当调度方 + +### 3.4 指定作业优先级 +```yaml +dag: + conf: + priority: 2 +``` +在job.conf.priority中指定任务的调度权重,数值越大,优先级越高 + +### 3.5 失败自动重试 + +```yaml +dag: + conf: + auto_retries: 2 +``` +在job.conf.auto_retries中指定任务失败重试次数,默认为0。 + +### 3.6 资源数 +```yaml +dag: + conf: + cores: 4 + task: + engine_run: + cores: 2 +``` +- 其中, dag.conf.cores为整个job的分配资源数(job_cores),dag.conf.engine_run.cores为task的分配资源数(task_cores)。若以此配置启动job,其最大并行度为2。 +- task并行度 = job_cores / task_cores + +### 3.7 任务超时时间 +```yaml +dag: + task: + timeout: 3600 # s +``` +在dag.task.timeout中指定task的超时时间。当任务在达到超时时间还处于running状态时,会触发自动kill job操作 + +### 3.8 任务provider +```yaml +dag: + task: + provider: fate:2.0.1@local +``` +在dag.task.provider中指定task的算法提供者、版本号和运行模式 + +## 4. 输入 +**描述** 上游输入,分为两种输入类型,分别是数据和模型。 + +### 4.1 数据输入 +- 作为组件的参数输入 +```yaml +dag: + party_tasks: + guest_9999: + tasks: + reader_0: + parameters: + name: breast_hetero_guest + namespace: experiment + host_9998: + tasks: + reader_0: + parameters: + name: breast_hetero_host + namespace: experiment +``` +reader组件支持直接传入某份fate数据表作为job级的数据输入。 + +- 某个组件输入另外一个组件的输出 +```yaml +dag: + tasks: + binning_0: + component_ref: hetero_feature_binning + inputs: + data: + train_data: + task_output_artifact: + output_artifact_key: train_output_data + producer_task: scale_0 +``` +binning_0依赖scale_0的输出数据train_output_data + +### 4.2 模型输入 +- model warehouse +```yaml +dag: + conf: + model_warehouse: + model_id: '202307171452088269870' + model_version: '0' + tasks: + selection_0: + component_ref: hetero_feature_selection + dependent_tasks: + - scale_0 + model: + input_model: + model_warehouse: + output_artifact_key: train_output_model + producer_task: selection_0 +``` + + +## 5. 输出 +作业的输出包括数据、模型和指标 +### 5.1 输出指标 +#### 查询指标 +查询输出指标命令: +```shell +flow output query-metric -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-metric -j 202308211911505128750 -r arbiter -p 9998 -tn lr_0` +- 输入内容如下 +```json +{ + "code": 0, + "data": [ + { + "data": [ + { + "metric": [ + 0.0 + ], + "step": 0, + "timestamp": 1692616428.253495 + } + ], + "groups": [ + { + "index": null, + "name": "default" + }, + { + "index": null, + "name": "train" + } + ], + "name": "lr_loss", + "step_axis": "iterations", + "type": "loss" + }, + { + "data": [ + { + "metric": [ + -0.07785049080848694 + ], + "step": 1, + "timestamp": 1692616432.9727712 + } + ], + "groups": [ + { + "index": null, + "name": "default" + }, + { + "index": null, + "name": "train" + } + ], + "name": "lr_loss", + "step_axis": "iterations", + "type": "loss" + } + ], + "message": "success" +} + +``` + + +### 5.2 输出模型 +#### 查询模型 +```shell +flow output query-model -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-model -j 202308211911505128750 -r host -p 9998 -tn lr_0` +- 查询结果如下: +```json +{ + "code": 0, + "data": { + "output_model": { + "data": { + "estimator": { + "end_epoch": 10, + "is_converged": false, + "lr_scheduler": { + "lr_params": { + "start_factor": 0.7, + "total_iters": 100 + }, + "lr_scheduler": { + "_get_lr_called_within_step": false, + "_last_lr": [ + 0.07269999999999996 + ], + "_step_count": 10, + "base_lrs": [ + 0.1 + ], + "end_factor": 1.0, + "last_epoch": 9, + "start_factor": 0.7, + "total_iters": 100, + "verbose": false + }, + "method": "linear" + }, + "optimizer": { + "alpha": 0.001, + "l1_penalty": false, + "l2_penalty": true, + "method": "sgd", + "model_parameter": [ + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ], + [ + 0.0 + ] + ], + "model_parameter_dtype": "float32", + "optim_param": { + "lr": 0.1 + }, + "optimizer": { + "param_groups": [ + { + "dampening": 0, + "differentiable": false, + "foreach": null, + "initial_lr": 0.1, + "lr": 0.07269999999999996, + "maximize": false, + "momentum": 0, + "nesterov": false, + "params": [ + 0 + ], + "weight_decay": 0 + } + ], + "state": {} + } + }, + "param": { + "coef_": [ + [ + -0.10828543454408646 + ], + [ + -0.07341302931308746 + ], + [ + -0.10850320011377335 + ], + [ + -0.10066638141870499 + ], + [ + -0.04595951363444328 + ], + [ + -0.07001449167728424 + ], + [ + -0.08949052542448044 + ], + [ + -0.10958756506443024 + ], + [ + -0.04012322425842285 + ], + [ + 0.02270071767270565 + ], + [ + -0.07198350876569748 + ], + [ + 0.00548586156219244 + ], + [ + -0.06599288433790207 + ], + [ + -0.06410090625286102 + ], + [ + 0.016374297440052032 + ], + [ + -0.01607361063361168 + ], + [ + -0.011447405442595482 + ], + [ + -0.04352564364671707 + ], + [ + 0.013161249458789825 + ], + [ + 0.013506329618394375 + ] + ], + "dtype": "float32", + "intercept_": null + } + } + }, + "meta": { + "batch_size": null, + "epochs": 10, + "init_param": { + "fill_val": 0.0, + "fit_intercept": false, + "method": "zeros", + "random_state": null + }, + "label_count": false, + "learning_rate_param": { + "method": "linear", + "scheduler_params": { + "start_factor": 0.7, + "total_iters": 100 + } + }, + "optimizer_param": { + "alpha": 0.001, + "method": "sgd", + "optimizer_params": { + "lr": 0.1 + }, + "penalty": "l2" + }, + "ovr": false + } + } + }, + "message": "success" +} + +``` + +#### 下载模型 +```shell +flow output download-model -j $job_id -r $role -p $party_id -tn $task_name -o $download_dir +``` +- `flow output download-model -j 202308211911505128750 -r host -p 9998 -tn lr_0 -o ./` +- 下载结果如下: +```json +{ + "code": 0, + "directory": "./output_model_202308211911505128750_host_9998_lr_0", + "message": "download success, please check the path: ./output_model_202308211911505128750_host_9998_lr_0" +} + + +``` + + +### 5.3 输出数据 +#### 查询数据表 +```shell +flow output query-data-table -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output query-data-table -j 202308211911505128750 -r host -p 9998 -tn binning_0` +- 查询结果如下: +```json +{ + "train_output_data": [ + { + "name": "9e28049c401311ee85c716b977118319", + "namespace": "202308211911505128750_binning_0" + } + ] +} +``` + +#### 预览数据 +```shell +flow output display-data -j $job_id -r $role -p $party_id -tn $task_name +``` +- `flow output display-data -j 202308211911505128750 -r host -p 9998 -tn binning_0` + +#### 下载数据 +```shell +flow output download-data -j $job_id -r $role -p $party_id -tn $task_name -o $download_dir +``` +- `flow output download-data -j 202308211911505128750 -r guest -p 9999 -tn lr_0 -o ./` +- 结果如下: +```json +{ + "code": 0, + "directory": "./output_data_202308211911505128750_guest_9999_lr_0", + "message": "download success, please check the path: ./output_data_202308211911505128750_guest_9999_lr_0" +} + +``` + diff --git a/doc/provider_register.md b/doc/provider_register.md new file mode 100644 index 000000000..37c93c500 --- /dev/null +++ b/doc/provider_register.md @@ -0,0 +1,147 @@ +# Component Registry + +## 1. Introduction +FATE Flow has designed an algorithm component registry module to support multiple algorithm vendors, versions, and various execution modes. + +## 2. Provider +*Definition*: `$name:$version@device`, such as `fate:2.0.0@local` +- name: Algorithm provider vendor +- version: Algorithm version +- device: Algorithm execution mode, e.g., docker, k8s, local, etc. + +### 2.1 Registration +- Registration command: + +```shell +flow provider register -c examples/provider/register.json +``` + +- Registering a local algorithm package requires providing the algorithm package path (`path`) and optionally the Python environment path (if not provided, the system environment will be used). +```json +{ + "name": "fate", + "device": "local", + "version": "2.0.1", + "metadata": { + "path": "/Users/tonly/FATE/python", + "venv": "/Users/tonly/opt/anaconda3/envs/fate3.8/bin/python" + } +} +``` + +- Registering a docker-based algorithm image: +```json +{ + "name": "fate", + "device": "docker", + "version": "2.0.0", + "metadata": { + "base_url": "", + "image": "federatedai/fate:2.0.0" + }, + "protocol": "bfia", + "components_description": {} +} +``` + +### 2.2 Querying + +- Command: +```shell +flow provider register --name fate --version 2.0.1 --device local +``` + +- Output: +```json +{ + "code": 0, + "data": [ + { + "create_time": 1703762542058, + "device": "local", + "metadata": { + "path": "/Users/tonly/FATE/python", + "venv": "/Users/tonly/opt/anaconda3/envs/fate3.8/bin/python" + }, + "name": "fate", + "provider_name": "fate:2.0.1@local", + "update_time": 1703762542058, + "version": "2.0.1" + } + ], + "message": "success" +} +``` + +### 2.3 Deletion +Used for deleting a registered algorithm. +- Command: +```shell +flow provider delete --name fate --version 2.0.1 --device local +``` + +- Output: +```json +{ + "code": 0, + "data": true, + "message": "success" +} +``` + +### 3. Component Registry and Discovery Mechanism +- Registering algorithms +- Task configuration carrying the provider parameter, see the [configuration methods](#configuration-methods) below + +![Component Registry](./images/fate_flow_component_registry.png) + +### 4. Configuration Methods +### 4.1 Global Job Configuration +```yaml +dag: + conf: + task: + provider: fate:2.0.1@local +``` +All tasks under the job inherit this provider. + +### 4.2 Global Party Task Configuration +```yaml +dag: + party_tasks: + guest_9999: + parties: + - party_id: + - '9999' + role: guest + conf: + provider: fate:2.0.1@local +``` +All tasks under guest 9999 inherit this provider. + +### 4.3 Global Task Configuration +```yaml +dag: + tasks: + reader_0: + conf: + provider: fate:2.0.1@local + component_ref: reader +``` +All reader components across all parties inherit this provider. + +### 4.4 Specified Task Configuration +```yaml +dag: + party_tasks: + guest_9999: + parties: + - party_id: + - '9999' + role: guest + tasks: + reader_0: + conf: + provider: fate:2.0.1@local +``` +The reader component under guest 9999 specifically inherits this provider. \ No newline at end of file diff --git a/doc/provider_register.zh.md b/doc/provider_register.zh.md new file mode 100644 index 000000000..df0d17521 --- /dev/null +++ b/doc/provider_register.zh.md @@ -0,0 +1,145 @@ +# 组件注册中心 + +## 1. 说明 +FATE Flow设计了算法组件注册模块,以支持多个算法开发厂商、多个版本及多种运行模式。 + +## 2. provider +*定义*: `$name:$version@device`, 如`fate:2.0.0@local` +- name: 算法提供厂商 +- version: 算法版本 +- device: 算法运行模式,如: docker, k8s, local等 + +### 2.1 注册 +- 注册命令: + +```shell +flow provider register -c examples/provider/register.json +``` +- 注册本地的算法包,需要传入算法包路径(path)和python环境路径(可选,若不传将使用系统的环境) +```json +{ + "name": "fate", + "device": "local", + "version": "2.0.1", + "metadata": { + "path": "/Users/tonly/FATE/python", + "venv": "/Users/tonly/opt/anaconda3/envs/fate3.8/bin/python" + } +} +``` + +- 注册docker算法镜像包 +```json +{ + "name": "fate", + "device": "docker", + "version": "2.0.0", + "metadata": { + "base_url": "", + "image": "federatedai/fate:2.0.0" + }, + "protocol": "bfia", + "components_description": {} +} +``` + +### 2.2 查询 + +- 命令: +```shell +flow provider register --name fate --version 2.0.1 --device local +``` +- 输出: +```json +{ + "code": 0, + "data": [ + { + "create_time": 1703762542058, + "device": "local", + "metadata": { + "path": "/Users/tonly/FATE/python", + "venv": "/Users/tonly/opt/anaconda3/envs/fate3.8/bin/python" + }, + "name": "fate", + "provider_name": "fate:2.0.1@local", + "update_time": 1703762542058, + "version": "2.0.1" + } + ], + "message": "success" +} + +``` + +### 2.3 删除 +用于删除已经注册的算法 +- 命令: +```shell +flow provider delete --name fate --version 2.0.1 --device local +``` +- 输出: +```json +{ + "code": 0, + "data": true, + "message": "success" +} +``` + +### 3.组件注册与发现机制 +- 注册算法 +- 任务配置中携带provider参数,详见如下[配置方法](#配置方法) + +![](./images/fate_flow_component_registry.png) + +### 4. 配置方法 +### 4.1 全局job配置 +```yaml +dag: + conf: + task: + provider: fate:2.0.1@local +``` +job下的所有task继承此provider + +### 4.2 全局参与方task配置 +```yaml +dag: + party_tasks: + guest_9999: + parties: + - party_id: + - '9999' + role: guest + conf: + provider: fate:2.0.1@local +``` +guest 9999下的所有task继承此provider + +### 4.3 全局task配置 +```yaml +dag: + tasks: + reader_0: + conf: + provider: fate:2.0.1@local + component_ref: reader +``` +所有参与方的reader组件继承此provider + +### 4.4 指定task配置 +```yaml +dag: + party_tasks: + guest_9999: + parties: + - party_id: + - '9999' + role: guest + tasks: + reader_0: + conf: + provider: fate:2.0.1@local +``` +guest 9999下reader组件继承此provider diff --git a/doc/quick_start.zh.md b/doc/quick_start.zh.md index 90176526b..26bf5a8d9 100644 --- a/doc/quick_start.zh.md +++ b/doc/quick_start.zh.md @@ -14,7 +14,7 @@ conda activate fate_env ``` - 安装fate flow及相关依赖 ```shell -pip install fate_client[fate,fate_flow]==2.0.0.b0 +pip install fate_client[fate,fate_flow]==2.0.0 ``` #### 1.1.2 服务初始化 @@ -31,137 +31,34 @@ fate_flow status/start/stop/restart ``` ### 1.2 单机版部署 -参考[单机版部署](https://github.com/FederatedAI/FATE/tree/v2.0.0-beta/deploy/standalone-deploy/README.zh.md) +参考[单机版部署](https://github.com/FederatedAI/FATE/tree/dev-2.0.0-rc/deploy/standalone-deploy/README.zh.md) ### 1.3 集群部署 -参考[allinone部署](https://github.com/FederatedAI/FATE/tree/v2.0.0-beta/deploy/cluster-deploy/allinone/fate-allinone_deployment_guide.zh.md) +参考[allinone部署](https://github.com/FederatedAI/FATE/tree/dev-2.0.0-rc/deploy/cluster-deploy/allinone/fate-allinone_deployment_guide.zh.md) ## 2. 使用指南 fate提供的客户端包括SDK、CLI和Pipeline,若你的环境中没有部署FATE Client,可以使用`pip install fate_client`下载,以下的使用操作均基于cli编写。 ### 2.1 数据上传 -在2.0-beta版本中,数据上传分为两步: -- upload: 将数据上传到FATE支持存储服务中 -- transformer: 将数据转化成dataframe -#### 2.1.1 upload -#### 2.1.1.1 配置及数据 - - 上传配置位于[examples-upload](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/examples/upload),上传数据位于[upload-data](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/examples/data) - - 你也可以使用自己的数据,并修改upload配置中的"meta"信息。 -#### 2.1.1.2 上传guest方数据 +更详细的数据操作指南可参考:[数据接入指南](data_access.zh.md) +### 2.1.1 配置及数据 + - 上传配置: [examples-upload](https://github.com/FederatedAI/FATE-Flow/tree/dev-2.0.0-rc/examples/upload) + - 上传数据: [upload-data](https://github.com/FederatedAI/FATE-Flow/tree/dev-2.0.0-rc/examples/data) +### 2.1.2 上传guest方数据 ```shell flow data upload -c examples/upload/upload_guest.json ``` -- 需要记录返回的name和namespace,作为transformer的参数。 -#### 2.1.1.3 上传host方数据 +### 2.1.3 上传host方数据 ```shell flow data upload -c examples/upload/upload_host.json ``` -- 需要记录返回的name和namespace,作为transformer的参数。 -#### 2.1.1.4 上传结果 -```json -{ - "code": 0, - "data": { - "name": "36491bc8-3fef-11ee-be05-16b977118319", - "namespace": "upload" - }, - "job_id": "202308211451535620150", - "message": "success" -} -``` -其中"namespace"和"name"是这份数据在fate中的标识,以便下面后续transformer阶段使用时可直接引用。 - -#### 2.1.1.5 数据查询 -因为upload为异步操作,需要确认是否上传成功才可进行后续操作。 -```shell -flow table query --namespace upload --name 36491bc8-3fef-11ee-be05-16b977118319 -``` -上传成功信息如下: -```json -{ - "code": 0, - "data": { - "count": 569, - "data_type": "table", - "engine": "standalone", - "meta": { - "delimiter": ",", - "dtype": "'float32", - "header": "extend_sid,id,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19", - "input_format": "dense", - "label_type": "int", - "match_id_name": "id", - "match_id_range": 0, - "sample_id_name": "extend_sid", - "tag_value_delimiter": ":", - "tag_with_value": false, - "weight_type": "float32" - }, - "name": "36491bc8-3fef-11ee-be05-16b977118319", - "namespace": "upload", - "path": "xxx", - "source": { - "component": "upload", - "output_artifact_key": "data", - "output_index": null, - "party_task_id": "", - "task_id": "", - "task_name": "upload" - } - }, - "message": "success" -} - -``` -若返回的code为0即为上传成功。 - -#### 2.1.2 transformer -#### 2.1.2.1 配置 - - transformer配置位于[examples-transformer](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/examples/transformer) -#### 2.1.2.2 transformer guest -- 配置路径位于: examples/transformer/transformer_guest.json -- 修改配置中"data_warehouse"的"namespace"和"name":上面upload guest阶段的输出 -```shell -flow data transformer -c examples/transformer/transformer_guest.json -``` -#### 2.1.2.3 transformer host -- 配置路径位于: examples/transformer/transformer_host.json -- 修改配置中"data_warehouse"的"namespace"和"name":上面upload host阶段的输出 -```shell -flow data transformer -c examples/transformer/transformer_host.json -``` -#### 2.1.2.4 transformer结果 -```json -{ - "code": 0, - "data": { - "name": "breast_hetero_guest", - "namespace": "experiment" - }, - "job_id": "202308211557455662860", - "message": "success" -} -``` -其中"namespace"和"name"是这份数据在fate中的标识,后续建模作业中使用。 - -#### 2.1.2.5 查看数据是否上传成功 - -因为transformer也是异步操作,需要确认是否上传成功才可进行后续操作。 -```shell -flow table query --namespace experiment --name breast_hetero_guest -``` -```shell -flow table query --namespace experiment --name breast_hetero_host -``` -若返回的code为0即为上传成功。 ### 2.2 开始FATE作业 #### 2.2.1 提交作业 当你的数据准备好后,可以开始提交作业给FATE Flow: -- 训练job配置example位于[lr-train](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/examples/lr/train_lr.yaml); -- 预测job配置example位于[lr-predict](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/examples/lr/predict_lr.yaml);预测任务需要修改"dag.conf.model_warehouse"成训练作业的输出模型。 -- 训练和预测job配置中站点id为"9998"和"9999"。如果你的部署环境为集群版,需要替换成真实的站点id;单机版可使用默认配置。 -- 如果想要使用自己的数据,可以更改配置中guest和host的data_warehouse的namespace和name +- job配置example位于[lr-train](https://github.com/FederatedAI/FATE-Flow/tree/dev-2.0.0-rc/examples/lr/train_lr.yaml); +- job配置中站点id为"9998"和"9999"。如果你的部署环境为集群版,需要替换成真实的站点id;单机版可使用默认配置。 +- 如果想要使用自己的数据,可以更改配置中reader的参数。 - 提交作业的命令为: ```shell flow job submit -c examples/lr/train_lr.yaml @@ -555,8 +452,8 @@ flow output download-data -j $job_id -r $role -p $party_id -tn $task_name -o $do ``` ## 3.更多文档 -- [Restful-api](https://github.com/FederatedAI/FATE-Flow/tree/v2.0.0-beta/doc/swagger/swagger.yaml) -- [CLI](https://github.com/FederatedAI/FATE-Client/tree/v2.0.0-beta/python/fate_client/flow_cli/build/doc) -- [Pipeline](https://github.com/FederatedAI/FATE/tree/v2.0.0-beta/doc/tutorial) -- [FATE快速开始](https://github.com/FederatedAI/FATE/tree/v2.0.0-beta/doc/2.0/quick_start.md) -- [FATE算法](https://github.com/FederatedAI/FATE/tree/v2.0.0-beta/doc/2.0/fate) +- [Restful-api](https://github.com/FederatedAI/FATE-Flow/tree/dev-2.0.0-rc/doc/swagger/swagger.yaml) +- [CLI](https://github.com/FederatedAI/FATE-Client/tree/dev-2.0.0-rc/python/fate_client/flow_cli/build/doc) +- [Pipeline](https://github.com/FederatedAI/FATE/tree/dev-2.0.0-rc/doc/tutorial) +- [FATE快速开始](https://github.com/FederatedAI/FATE/tree/dev-2.0.0-rc/doc/2.0/quick_start.md) +- [FATE算法](https://github.com/FederatedAI/FATE/tree/dev-2.0.0-rc/doc/2.0/fate) diff --git a/doc/system_conf.md b/doc/system_conf.md index d8dc45843..82ba92e43 100644 --- a/doc/system_conf.md +++ b/doc/system_conf.md @@ -85,36 +85,6 @@ default_engines: - device: Algorithm launch mode, local/docker/k8s, etc. ## Communication Engine Pool -### Pulsar -```yaml -pulsar: - host: 192.168.0.5 - port: 6650 - mng_port: 8080 - cluster: standalone - tenant: fl-tenant - topic_ttl: 30 - route_table: - mode: replication - max_message_size: 1048576 -``` -### Nginx: -```yaml -nginx: - host: 127.0.0.1 - http_port: 9300 - grpc_port: 9310 - protocol: http -``` - -### RabbitMQ -```yaml -nginx: - host: 127.0.0.1 - http_port: 9300 - grpc_port: 9310 - protocol: http -``` ### OSx ```yaml @@ -145,7 +115,7 @@ eggroll: ### Spark ```yaml -eggroll: +spark: home: cores: 32 ``` diff --git a/doc/system_conf.zh.md b/doc/system_conf.zh.md index 5c8f910f5..c10671f44 100644 --- a/doc/system_conf.zh.md +++ b/doc/system_conf.zh.md @@ -85,41 +85,6 @@ default_engines: - device: 算法启动方式, local/docker/k8s等 ## 通信引擎池 -### pulsar -```yaml -pulsar: - host: 192.168.0.5 - port: 6650 - mng_port: 8080 - cluster: standalone - tenant: fl-tenant - topic_ttl: 30 - # default conf/pulsar_route_table.yaml - route_table: - # mode: replication / client, default: replication - mode: replication - max_message_size: 1048576 -``` -### nginx: -```yaml -nginx: - host: 127.0.0.1 - http_port: 9300 - grpc_port: 9310 - # http or grpc - protocol: http -``` - -### rabbitmq -```yaml -nginx: - host: 127.0.0.1 - http_port: 9300 - grpc_port: 9310 - # http or grpc - protocol: http -``` - ### osx ```yaml host: 127.0.0.1 @@ -149,7 +114,7 @@ eggroll: ### spark ```yaml -eggroll: +spark: home: cores: 32 ``` diff --git a/examples/dag/lr/train_lr.yaml b/examples/lr/train_lr.yaml similarity index 99% rename from examples/dag/lr/train_lr.yaml rename to examples/lr/train_lr.yaml index 604752a94..1b13597d7 100644 --- a/examples/dag/lr/train_lr.yaml +++ b/examples/lr/train_lr.yaml @@ -201,4 +201,4 @@ dag: - party_id: - '9998' role: host -schema_version: 2.0.0.beta +schema_version: 2.0.0