// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";

// Language-specific code generation options for Go and Java.
option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataDiscoveryProto";
option java_package = "com.google.cloud.dataplex.v1";
// BigQuery dataset resource type. Declared here so that
// `DataDiscoveryResult.BigQueryPublishing.dataset` can reference it through
// `google.api.resource_reference`.
option (google.api.resource_definition) = {
  type: "bigquery.googleapis.com/Dataset"
  pattern: "projects/{project}/datasets/{dataset}"
};
// BigQuery connection resource type. Declared here so that
// `DataDiscoverySpec.BigQueryPublishingConfig.connection` can reference it
// through `google.api.resource_reference`.
option (google.api.resource_definition) = {
  type: "bigqueryconnection.googleapis.com/Connection"
  pattern: "projects/{project}/locations/{location}/connections/{connection}"
};

// Spec for a data discovery scan.
//
// Combines the optional BigQuery publishing settings
// (`bigquery_publishing_config`) with the configuration of the resource being
// scanned (`resource_config`).
message DataDiscoverySpec {
  // Describes BigQuery publishing configurations.
  message BigQueryPublishingConfig {
    // Determines how discovered tables are published.
    enum TableType {
      // Table type unspecified.
      TABLE_TYPE_UNSPECIFIED = 0;

      // Default. Discovered tables are published as BigQuery external tables
      // whose data is accessed using the credentials of the user querying the
      // table.
      EXTERNAL = 1;

      // Discovered tables are published as BigLake external tables whose data
      // is accessed using the credentials of the associated BigQuery
      // connection.
      BIGLAKE = 2;
    }

    // NOTE(review): field number 1 is not used by this message and is not
    // declared `reserved`. If a field was removed, confirm whether the number
    // (and its name) should be reserved to prevent accidental reuse.

    // Optional. Determines whether to publish discovered tables as BigLake
    // external tables or non-BigLake external tables.
    TableType table_type = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The BigQuery connection used to create BigLake tables.
    // Must be in the form
    // `projects/{project_id}/locations/{location_id}/connections/{connection_id}`
    string connection = 3 [
      (google.api.field_behavior) = OPTIONAL,
      (google.api.resource_reference) = {
        type: "bigqueryconnection.googleapis.com/Connection"
      }
    ];

    // Optional. The location of the BigQuery dataset to publish BigLake
    // external or non-BigLake external tables to.
    //
    // 1. If the Cloud Storage bucket is located in a multi-region, then the
    //    BigQuery dataset can be in the same multi-region or in any single
    //    region that is included in that multi-region. The datascan can be
    //    created in any single region that is included in the same
    //    multi-region.
    // 2. If the Cloud Storage bucket is located in a dual-region, then the
    //    BigQuery dataset can be located in regions that are included in the
    //    dual-region, or in a multi-region that includes the dual-region. The
    //    datascan can be created in any single region that is included in the
    //    same dual-region.
    // 3. If the Cloud Storage bucket is located in a single region, then the
    //    BigQuery dataset can be in the same single region or in any
    //    multi-region that includes the same single region. The datascan will
    //    be created in the same single region as the bucket.
    // 4. If the BigQuery dataset is in a single region, it must be in the same
    //    single region as the datascan.
    //
    // For supported values, refer to
    // https://cloud.google.com/bigquery/docs/locations#supported_locations.
    string location = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Configurations related to Cloud Storage as the data source.
  message StorageConfig {
    // Describes CSV and similar semi-structured data formats.
    message CsvOptions {
      // Optional. The number of rows to interpret as header rows that should be
      // skipped when reading data rows.
      int32 header_rows = 1 [(google.api.field_behavior) = OPTIONAL];

      // Optional. The delimiter that is used to separate values. The default is
      // `,` (comma).
      string delimiter = 2 [(google.api.field_behavior) = OPTIONAL];

      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 3 [(google.api.field_behavior) = OPTIONAL];

      // Optional. Whether to disable the inference of data types for CSV data.
      // If true, all columns are registered as strings.
      bool type_inference_disabled = 4 [(google.api.field_behavior) = OPTIONAL];

      // Optional. The character used to quote column values. Accepts `"`
      // (double quotation mark) or `'` (single quotation mark). If unspecified,
      // defaults to `"` (double quotation mark).
      string quote = 5 [(google.api.field_behavior) = OPTIONAL];
    }

    // Describes JSON data format.
    message JsonOptions {
      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 1 [(google.api.field_behavior) = OPTIONAL];

      // Optional. Whether to disable the inference of data types for JSON data.
      // If true, all columns are registered as their primitive types
      // (string, number, or boolean).
      bool type_inference_disabled = 2 [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. Defines the data to include during discovery when only a subset
    // of the data should be considered. Provide a list of patterns that
    // identify the data to include. For Cloud Storage bucket assets, these
    // patterns are interpreted as glob patterns used to match object names. For
    // BigQuery dataset assets, these patterns are interpreted as patterns to
    // match table names.
    repeated string include_patterns = 1
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Defines the data to exclude during discovery. Provide a list of
    // patterns that identify the data to exclude. For Cloud Storage bucket
    // assets, these patterns are interpreted as glob patterns used to match
    // object names. For BigQuery dataset assets, these patterns are interpreted
    // as patterns to match table names.
    repeated string exclude_patterns = 2
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Configuration for CSV data.
    CsvOptions csv_options = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Configuration for JSON data.
    JsonOptions json_options = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Configuration for metadata publishing.
  BigQueryPublishingConfig bigquery_publishing_config = 1
      [(google.api.field_behavior) = OPTIONAL];

  // The configurations of the data discovery scan resource. As a `oneof`, at
  // most one member can be set; Cloud Storage is the only member currently
  // defined.
  oneof resource_config {
    // Cloud Storage related configurations.
    StorageConfig storage_config = 100;
  }
}

// The output of a data discovery scan.
//
// Both fields are output only: where discovered tables were published in
// BigQuery, and the statistics of the discovery job.
message DataDiscoveryResult {
  // Describes the BigQuery publishing result: the dataset that discovered
  // tables are published to and that dataset's location.
  message BigQueryPublishing {
    // Output only. The BigQuery dataset the discovered tables are published to.
    // References a resource of type `bigquery.googleapis.com/Dataset`, whose
    // pattern is declared in this file as
    // `projects/{project}/datasets/{dataset}`.
    string dataset = 1 [
      (google.api.field_behavior) = OUTPUT_ONLY,
      (google.api.resource_reference) = {
        type: "bigquery.googleapis.com/Dataset"
      }
    ];

    // Output only. The location of the BigQuery publishing dataset.
    string location = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Describes result statistics of a data scan discovery job.
  message ScanStatistics {
    // The number of files scanned.
    int32 scanned_file_count = 1;

    // The data processed in bytes.
    int64 data_processed_bytes = 2;

    // The number of files excluded.
    int32 files_excluded = 3;

    // The number of tables created.
    int32 tables_created = 4;

    // The number of tables deleted.
    int32 tables_deleted = 5;

    // The number of tables updated.
    int32 tables_updated = 6;

    // The number of filesets created.
    int32 filesets_created = 7;

    // The number of filesets deleted.
    int32 filesets_deleted = 8;

    // The number of filesets updated.
    int32 filesets_updated = 9;
  }

  // Output only. The BigQuery publishing result of the scan (target dataset
  // and its location).
  BigQueryPublishing bigquery_publishing = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Describes result statistics of a data scan discovery job.
  ScanStatistics scan_statistics = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];
}
