// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "cloud.google.com/go/dataflow/apiv1beta3/dataflowpb;dataflowpb";
option java_multiple_files = true;
option java_outer_classname = "MetricsProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// The Dataflow Metrics API lets you monitor the progress of Dataflow
// jobs.
service MetricsV1Beta3 {
  // Host name that clients use for this service by default.
  option (google.api.default_host) = "dataflow.googleapis.com";
  // OAuth scopes associated with this service, as one comma-separated string.
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform,"
      "https://www.googleapis.com/auth/compute";

  // Requests the metrics that describe the current status of a job.
  //
  // To request the status of a job, we recommend using
  // `projects.locations.jobs.getMetrics` with a
  // [regional endpoint](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints).
  // Using `projects.jobs.getMetrics` is not recommended, as you can only
  // request the status of jobs that are running in `us-central1`.
  rpc GetJobMetrics(GetJobMetricsRequest) returns (JobMetrics) {
    // The primary binding is the regional path that includes `{location}`;
    // the additional binding is the path without a location.
    option (google.api.http) = {
      get: "/v1b3/projects/{project_id}/locations/{location}/jobs/{job_id}/metrics"
      additional_bindings {
        get: "/v1b3/projects/{project_id}/jobs/{job_id}/metrics"
      }
    };
  }

  // Request detailed information about the execution status of the job.
  //
  // The stages in the response are paginated: see `page_size` and
  // `page_token` on the request and `next_page_token` on the response.
  //
  // EXPERIMENTAL.  This API is subject to change or removal without notice.
  rpc GetJobExecutionDetails(GetJobExecutionDetailsRequest)
      returns (JobExecutionDetails) {
    option (google.api.http) = {
      get: "/v1b3/projects/{project_id}/locations/{location}/jobs/{job_id}/executionDetails"
    };
  }

  // Request detailed information about the execution status of a stage of the
  // job.
  //
  // The response is paginated: see `page_size` and `page_token` on the
  // request and `next_page_token` on the response.
  //
  // EXPERIMENTAL.  This API is subject to change or removal without notice.
  rpc GetStageExecutionDetails(GetStageExecutionDetailsRequest)
      returns (StageExecutionDetails) {
    option (google.api.http) = {
      get: "/v1b3/projects/{project_id}/locations/{location}/jobs/{job_id}/stages/{stage_id}/executionDetails"
    };
  }
}

// Identifies a metric by describing the source which generated the
// metric.
message MetricStructuredName {
  // Origin (namespace) of metric name. May be blank for user-defined metrics;
  // will be "dataflow" for metrics defined by the Dataflow service or SDK.
  string origin = 1;

  // Worker-defined metric name.
  string name = 2;

  // Zero or more labeled fields which identify the part of the job this
  // metric is associated with, such as the name of a step or collection.
  //
  // For example, built-in counters associated with steps will have
  // `context['step'] = <step-name>`. Counters associated with PCollections
  // in the SDK will have `context['pcollection'] = <pcollection-name>`.
  map<string, string> context = 3;
}

// Describes the state of a metric.
// Next ID: 14
message MetricUpdate {
  // Field number 10 is skipped by the numbering in this message even though
  // the next ID is 14. It is reserved here so that it cannot be assigned to a
  // new field by accident.
  // NOTE(review): confirm against the field history that 10 was retired.
  reserved 10;

  // Name of the metric.
  MetricStructuredName name = 1;

  // Metric aggregation kind.  The possible metric aggregation kinds are
  // "Sum", "Max", "Min", "Mean", "Set", "And", "Or", and "Distribution".
  // The specified aggregation kind is case-insensitive.
  //
  // If omitted, this is not an aggregated value but instead
  // a single metric sample value.
  string kind = 2;

  // True if this metric is reported as the total cumulative aggregate
  // value accumulated since the worker started working on this WorkItem.
  // By default this is false, indicating that this metric is reported
  // as a delta that is not associated with any WorkItem.
  bool cumulative = 3;

  // Worker-computed aggregate value for aggregation kinds "Sum", "Max", "Min",
  // "And", and "Or".  The possible value types are Long, Double, and Boolean.
  google.protobuf.Value scalar = 4;

  // Worker-computed aggregate value for the "Mean" aggregation kind.
  // This holds the sum of the aggregated values and is used in combination
  // with `mean_count` below to obtain the actual mean aggregate value.
  // The only possible value types are Long and Double.
  google.protobuf.Value mean_sum = 5;

  // Worker-computed aggregate value for the "Mean" aggregation kind.
  // This holds the count of the aggregated values and is used in combination
  // with `mean_sum` above to obtain the actual mean aggregate value.
  // The only possible value type is Long.
  google.protobuf.Value mean_count = 6;

  // Worker-computed aggregate value for the "Set" aggregation kind.  The only
  // possible value type is a list of Values whose type can be Long, Double,
  // String, or BoundedTrie according to the metric's type.  All Values in the
  // list must be of the same type.
  google.protobuf.Value set = 7;

  // Worker-computed aggregate value for the "Trie" aggregation kind.  The only
  // possible value type is a BoundedTrieNode.
  google.protobuf.Value trie = 13;

  // A struct value describing properties of a distribution of numeric values.
  google.protobuf.Value distribution = 11;

  // A struct value describing properties of a Gauge.
  // Metrics of gauge type show the value of a metric across time, and are
  // aggregated based on the newest value.
  google.protobuf.Value gauge = 12;

  // Worker-computed aggregate value for internal use by the Dataflow
  // service.
  google.protobuf.Value internal = 8;

  // Timestamp associated with the metric value. Optional when workers are
  // reporting work progress; it will be filled in responses from the
  // metrics API.
  google.protobuf.Timestamp update_time = 9;
}

// Request to get job metrics.
message GetJobMetricsRequest {
  // A project ID.
  string project_id = 1;

  // The ID of the job to get metrics for.
  string job_id = 2;

  // Return only metric data that has changed since this time.
  // Default is to return all information about all metrics for the job.
  google.protobuf.Timestamp start_time = 3;

  // The [regional endpoint](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints)
  // that contains the job specified by `job_id`.
  string location = 4;
}

// JobMetrics contains a collection of metrics describing the detailed progress
// of a Dataflow job. Metrics correspond to user-defined and system-defined
// metrics in the job. For more information, see
// [Dataflow job metrics](https://cloud.google.com/dataflow/docs/guides/using-monitoring-intf).
//
// This resource captures only the most recent values of each metric;
// time-series data can be queried for them (under the same metric names)
// from Cloud Monitoring.
message JobMetrics {
  // Timestamp as of which metric values are current.
  google.protobuf.Timestamp metric_time = 1;

  // All metrics for this job.
  repeated MetricUpdate metrics = 2;
}

// Request to get job execution details.
message GetJobExecutionDetailsRequest {
  // A project ID.
  string project_id = 1;

  // The ID of the job to get execution details for.
  string job_id = 2;

  // The [regional endpoint](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints)
  // that contains the job specified by `job_id`.
  string location = 3;

  // If specified, determines the maximum number of stages to
  // return.  If unspecified, the service may choose an appropriate
  // default, or may return an arbitrarily large number of results.
  int32 page_size = 4;

  // If supplied, this should be the value of `next_page_token` returned
  // in the `JobExecutionDetails` of an earlier call. This will cause the next
  // page of results to be returned.
  string page_token = 5;
}

// Information about the progress of some component of job execution.
message ProgressTimeseries {
  // A single (time, value) point in the timeseries.
  message Point {
    // The timestamp of the point.
    google.protobuf.Timestamp time = 1;

    // The value of the point.
    double value = 2;
  }

  // The current progress of the component, in the range [0,1].
  double current_progress = 1;

  // History of progress for the component.
  //
  // Points are sorted by time.
  repeated Point data_points = 2;
}

// Information useful for straggler identification and debugging.
message StragglerInfo {
  // Information useful for debugging a straggler. Each type will provide
  // specialized debugging information relevant for a particular cause.
  // The StragglerDebuggingInfo will be a 1:1 mapping to the StragglerCause
  // enum.
  message StragglerDebuggingInfo {
    // The cause-specific debugging details. At most one member is set.
    oneof straggler_debugging_info_value {
      // Hot key debugging details.
      HotKeyDebuggingInfo hot_key = 1;
    }
  }

  // The time when the work item attempt became a straggler.
  google.protobuf.Timestamp start_time = 1;

  // The straggler causes, keyed by the string representation of the
  // StragglerCause enum. Each value contains specialized debugging
  // information for that straggler cause.
  //
  // Note: the StragglerCause enum is not declared in this file.
  map<string, StragglerDebuggingInfo> causes = 2;
}

// Information useful for streaming straggler identification and debugging.
message StreamingStragglerInfo {
  // Start time of this straggler.
  google.protobuf.Timestamp start_time = 1;

  // End time of this straggler.
  google.protobuf.Timestamp end_time = 2;

  // Name of the worker where the straggler was detected.
  string worker_name = 3;

  // The event-time watermark lag at the time of the straggler detection.
  google.protobuf.Duration data_watermark_lag = 4;

  // The system watermark lag at the time of the straggler detection.
  google.protobuf.Duration system_watermark_lag = 5;
}

// Information for a straggler, which is either a batch straggler or a
// streaming straggler.
message Straggler {
  // Information useful for straggler identification and debugging.
  // At most one member is set.
  oneof straggler_info {
    // Batch straggler identification and debugging information.
    StragglerInfo batch_straggler = 1;

    // Streaming straggler identification and debugging information.
    StreamingStragglerInfo streaming_straggler = 2;
  }
}

// Information useful for debugging a hot key detection.
message HotKeyDebuggingInfo {
  // Information about a hot key.
  message HotKeyInfo {
    // The age of the hot key measured from when it was first detected.
    google.protobuf.Duration hot_key_age = 1;

    // A detected hot key that is causing limited parallelism. This field will
    // be populated only if the following flag is set to true:
    // "--enable_hot_key_logging".
    string key = 2;

    // If true, then the `key` field above is truncated and cannot be
    // deserialized. This occurs if `key` is populated and the key size is
    // >5MB.
    bool key_truncated = 3;
  }

  // Debugging information for each detected hot key. Keyed by a hash of the
  // key.
  map<uint64, HotKeyInfo> detected_hot_keys = 1;
}

// Summarized straggler identification details.
message StragglerSummary {
  // The total count of stragglers.
  int64 total_straggler_count = 1;

  // Aggregated counts of straggler causes, keyed by the string representation
  // of the StragglerCause enum.
  //
  // Note: the StragglerCause enum is not declared in this file.
  map<string, int64> straggler_cause_count = 2;

  // The most recent stragglers.
  repeated Straggler recent_stragglers = 3;
}

// The state of some component of job execution.
//
// Used as the `state` of a stage (`StageSummary`) and of a work item
// (`WorkItemDetails`).
enum ExecutionState {
  // The component state is unknown or unspecified. This is the default
  // (zero) value.
  EXECUTION_STATE_UNKNOWN = 0;

  // The component is not yet running.
  EXECUTION_STATE_NOT_STARTED = 1;

  // The component is currently running.
  EXECUTION_STATE_RUNNING = 2;

  // The component succeeded.
  EXECUTION_STATE_SUCCEEDED = 3;

  // The component failed.
  EXECUTION_STATE_FAILED = 4;

  // Execution of the component was cancelled.
  EXECUTION_STATE_CANCELLED = 5;
}

// Information about a particular execution stage of a job.
message StageSummary {
  // ID of this stage.
  string stage_id = 1;

  // State of this stage.
  ExecutionState state = 2;

  // Start time of this stage.
  google.protobuf.Timestamp start_time = 3;

  // End time of this stage.
  //
  // If the stage is completed, this is the actual end time of the stage.
  // Otherwise, it is the predicted end time.
  google.protobuf.Timestamp end_time = 4;

  // Progress for this stage.
  // Only applicable to Batch jobs.
  ProgressTimeseries progress = 5;

  // Metrics for this stage.
  repeated MetricUpdate metrics = 6;

  // Straggler summary for this stage.
  StragglerSummary straggler_summary = 7;
}

// Information about the execution of a job.
message JobExecutionDetails {
  // The stages of the job execution.
  repeated StageSummary stages = 1;

  // If present, this response does not contain all requested stages.  To
  // obtain the next page of results, repeat the request with `page_token` set
  // to this value.
  string next_page_token = 2;
}

// Request to get information about a particular execution stage of a job.
// Currently only tracked for Batch jobs.
message GetStageExecutionDetailsRequest {
  // A project ID.
  string project_id = 1;

  // The ID of the job to get execution details for.
  string job_id = 2;

  // The [regional endpoint](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints)
  // that contains the job specified by `job_id`.
  string location = 3;

  // The stage for which to fetch information.
  string stage_id = 4;

  // If specified, determines the maximum number of work items to
  // return.  If unspecified, the service may choose an appropriate
  // default, or may return an arbitrarily large number of results.
  int32 page_size = 5;

  // If supplied, this should be the value of `next_page_token` returned
  // in the `StageExecutionDetails` of an earlier call. This will cause the
  // next page of results to be returned.
  string page_token = 6;

  // Lower time bound of work items to include, by start time.
  google.protobuf.Timestamp start_time = 7;

  // Upper time bound of work items to include, by start time.
  google.protobuf.Timestamp end_time = 8;
}

// Information about an individual work item execution.
message WorkItemDetails {
  // Name of this work item.
  string task_id = 1;

  // Attempt ID of this work item.
  string attempt_id = 2;

  // Start time of this work item attempt.
  google.protobuf.Timestamp start_time = 3;

  // End time of this work item attempt.
  //
  // If the work item is completed, this is the actual end time of the work
  // item.  Otherwise, it is the predicted end time.
  google.protobuf.Timestamp end_time = 4;

  // State of this work item.
  ExecutionState state = 5;

  // Progress of this work item.
  ProgressTimeseries progress = 6;

  // Metrics for this work item.
  repeated MetricUpdate metrics = 7;

  // Information about straggler detections for this work item.
  StragglerInfo straggler_info = 8;
}

// Information about a worker and the work items it processed.
message WorkerDetails {
  // Name of this worker.
  string worker_name = 1;

  // Work items processed by this worker, sorted by time.
  repeated WorkItemDetails work_items = 2;
}

// Information about the workers and work items within a stage.
message StageExecutionDetails {
  // Workers that have done work on the stage.
  repeated WorkerDetails workers = 1;

  // If present, this response does not contain all requested work items.  To
  // obtain the next page of results, repeat the request with `page_token` set
  // to this value.
  string next_page_token = 2;
}
