/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the COPYING file, which can be found at the root of the source code       *
 * distribution tree, or in https://www.hdfgroup.org/licenses.               *
 * If you do not have access to either file, you may request a copy from     *
 * help@hdfgroup.org.                                                        *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * Purpose: An initial implementation of a subfiling VFD which is
 *          derived from other "stacked" VFDs such as the splitter,
 *          mirror, and family VFDs.
 */

#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */

#include "H5private.h"          /* Generic Functions        */
#include "H5CXprivate.h"        /* API contexts, etc.       */
#include "H5Dprivate.h"         /* Dataset stuff            */
#include "H5Eprivate.h"         /* Error handling           */
#include "H5FDprivate.h"        /* File drivers             */
#include "H5FDsubfiling.h"      /* Subfiling file driver    */
#include "H5FDsubfiling_priv.h" /* Subfiling file driver    */
#include "H5FDsec2.h"           /* Sec2 VFD                 */
#include "H5FLprivate.h"        /* Free Lists               */
#include "H5Fprivate.h"         /* File access              */
#include "H5Iprivate.h"         /* IDs                      */
#include "H5MMprivate.h"        /* Memory management        */
#include "H5Pprivate.h"         /* Property lists           */

/* The driver identification number, initialized at runtime */
static hid_t H5FD_SUBFILING_g = H5I_INVALID_HID;

/* Whether the driver initialized MPI on its own */
static bool H5FD_mpi_self_initialized = false;

/* The description of a file belonging to this driver. The 'eoa' and 'eof'
 * determine the amount of hdf5 address space in use and the high-water mark
 * of the file (the current size of the underlying filesystem file). The
 * 'pos' value is used to eliminate file position updates when they would be a
 * no-op. Unfortunately we've found systems that use separate file position
 * indicators for reading and writing so the lseek can only be eliminated if
 * the current operation is the same as the previous operation.  When opening
 * a file the 'eof' will be set to the current file size, `eoa' will be set
 * to zero, 'pos' will be set to H5F_ADDR_UNDEF (as it is when an error
 * occurs), and 'op' will be set to H5F_OP_UNKNOWN.
 */
/***************************************************************************
 *
 * Structure: H5FD_subfiling_t
 *
 * Purpose:
 *
 *     H5FD_subfiling_t is a structure used to store all information needed
 *     to setup, manage, and take down subfiling for a HDF5 file.
 *
 *     This structure is created when such a file is "opened" and
 *     discarded when it is "closed".
 *
 *     Presents a system of subfiles as a single file to the HDF5 library.
 *
 *
 * `pub` (H5FD_t)
 *
 *     Instance of H5FD_t which contains all fields common to all VFDs.
 *     It must be the first item in this structure, since at higher levels,
 *     this structure will be treated as an instance of H5FD_t.
 *
 * `fa` (H5FD_subfiling_config_t)
 *
 *     Instance of `H5FD_subfiling_config_t` containing the subfiling
 *     configuration data needed to "open" the HDF5 file.
 *
 *
 *  Document additional subfiling fields here.
 *
 *  Recall that the existing fields are inherited from the sec2 driver
 *  and should be kept or not as appropriate for the sub-filing VFD.
 *
 *
 ***************************************************************************/

typedef struct H5FD_subfiling_t {
    H5FD_t                  pub; /* public stuff, must be first      */
    H5FD_subfiling_config_t fa;  /* driver-specific file access properties */

    /* MPI Info */
    MPI_Comm comm;
    MPI_Comm ext_comm;
    MPI_Info info;
    int      mpi_rank;
    int      mpi_size;

    H5FD_t *sf_file;
    H5FD_t *stub_file;

    uint64_t file_id;
    int64_t  context_id; /* The value used to lookup a subfiling context for the file */

    bool fail_to_encode; /* Used to check for failures from sb_get_size routine */

    char *file_dir;  /* Directory where we find files */
    char *file_path; /* The user defined filename */

    /*
     * The element layouts above this point are identical with the
     * H5FD_ioc_t structure. As a result,
     *
     * Everything which follows is unique to the H5FD_subfiling_t
     */
    haddr_t        eoa;                             /* end of allocated region                    */
    haddr_t        eof;                             /* end of file; current file size             */
    haddr_t        last_eoa;                        /* Last known end-of-address marker           */
    haddr_t        local_eof;                       /* Local end-of-file address for each process */
    haddr_t        pos;                             /* current file I/O position                  */
    H5FD_file_op_t op;                              /* last operation                             */
    char           filename[H5FD_MAX_FILENAME_LEN]; /* Copy of file name from open operation */
} H5FD_subfiling_t;

/*
 * These macros check for overflow of various quantities.  These macros
 * assume that HDoff_t is signed and haddr_t and size_t are unsigned.
 *
 * ADDR_OVERFLOW:   Checks whether a file address of type `haddr_t'
 *                  is too large to be represented by the second argument
 *                  of the file seek function.
 *
 * SIZE_OVERFLOW:   Checks whether a buffer size of type `hsize_t' is too
 *                  large to be represented by the `size_t' type.
 *
 * REGION_OVERFLOW: Checks whether an address and size pair describe data
 *                  which can be addressed entirely by the second
 *                  argument of the file seek function.
 */
#define MAXADDR          (((haddr_t)1 << (8 * sizeof(HDoff_t) - 1)) - 1)
#define ADDR_OVERFLOW(A) (HADDR_UNDEF == (A) || ((A) & ~(haddr_t)MAXADDR))
#define SIZE_OVERFLOW(Z) ((Z) & ~(hsize_t)MAXADDR)
#define REGION_OVERFLOW(A, Z)                                                                                \
    (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || HADDR_UNDEF == (A) + (Z) || (HDoff_t)((A) + (Z)) < (HDoff_t)(A))

/*
 * NOTE: Must be kept in sync with the private
 * H5F_MAX_DRVINFOBLOCK_SIZE macro value for now
 */
#define H5FD_SUBFILING_MAX_DRV_INFO_SIZE 1024

/* Prototypes */
static herr_t  H5FD__subfiling_term(void);
static hsize_t H5FD__subfiling_sb_size(H5FD_t *_file);
static herr_t  H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf);
static herr_t  H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf);
static void   *H5FD__subfiling_fapl_get(H5FD_t *_file);
static void   *H5FD__subfiling_fapl_copy(const void *_old_fa);
static herr_t  H5FD__subfiling_fapl_free(void *_fa);
static H5FD_t *H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr);
static herr_t  H5FD__subfiling_close(H5FD_t *_file);
static int     H5FD__subfiling_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
static herr_t  H5FD__subfiling_query(const H5FD_t *_f1, unsigned long *flags);
static haddr_t H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
static herr_t  H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
static haddr_t H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t type);
static herr_t  H5FD__subfiling_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle);
static herr_t  H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, size_t size,
                                    void *buf);
static herr_t  H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
                                     const void *buf);
static herr_t  H5FD__subfiling_read_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
                                           haddr_t addrs[], size_t sizes[], void *bufs[] /* out */);
static herr_t  H5FD__subfiling_write_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
                                            haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */);
static herr_t  H5FD__subfiling_truncate(H5FD_t *_file, hid_t dxpl_id, bool closing);
#if 0
static herr_t  H5FD__subfiling_lock(H5FD_t *_file, bool rw);
static herr_t  H5FD__subfiling_unlock(H5FD_t *_file);
#endif
static herr_t H5FD__subfiling_del(const char *name, hid_t fapl);
static herr_t H5FD__subfiling_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
                                  void **output);

static herr_t H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *config_out);
static herr_t H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa);
static int    H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr);

static herr_t H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr);

static herr_t init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts,
                            size_t dtype_extent, size_t max_iovec_len, int64_t *mem_buf_offset,
                            int64_t *target_file_offset, int64_t *io_block_len, int *first_subfile_index,
                            int *n_subfiles_used, int64_t *max_io_req_per_subfile);
static herr_t iovec_fill_first(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                               int64_t start_mem_offset, int64_t start_file_offset, int64_t first_io_len,
                               int64_t *mem_offset_out, int64_t *target_file_offset_out,
                               int64_t *io_block_len_out);
static herr_t iovec_fill_last(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                              int64_t start_mem_offset, int64_t start_file_offset, int64_t last_io_len,
                              int64_t *mem_offset_out, int64_t *target_file_offset_out,
                              int64_t *io_block_len_out);
static herr_t iovec_fill_first_last(subfiling_context_t *sf_context, int64_t iovec_depth,
                                    int64_t target_datasize, int64_t start_mem_offset,
                                    int64_t start_file_offset, int64_t first_io_len, int64_t last_io_len,
                                    int64_t *mem_offset_out, int64_t *target_file_offset_out,
                                    int64_t *io_block_len_out);
static herr_t iovec_fill_uniform(subfiling_context_t *sf_context, int64_t iovec_depth,
                                 int64_t target_datasize, int64_t start_mem_offset, int64_t start_file_offset,
                                 int64_t *mem_offset_out, int64_t *target_file_offset_out,
                                 int64_t *io_block_len_out);

void H5FD__subfiling_mpi_finalize(void);

static const H5FD_class_t H5FD_subfiling_g = {
    H5FD_CLASS_VERSION,                /* VFD interface version */
    H5_VFD_SUBFILING,                  /* value                 */
    H5FD_SUBFILING_NAME,               /* name                  */
    MAXADDR,                           /* maxaddr               */
    H5F_CLOSE_WEAK,                    /* fc_degree             */
    H5FD__subfiling_term,              /* terminate             */
    H5FD__subfiling_sb_size,           /* sb_size               */
    H5FD__subfiling_sb_encode,         /* sb_encode             */
    H5FD__subfiling_sb_decode,         /* sb_decode             */
    sizeof(H5FD_subfiling_config_t),   /* fapl_size             */
    H5FD__subfiling_fapl_get,          /* fapl_get              */
    H5FD__subfiling_fapl_copy,         /* fapl_copy             */
    H5FD__subfiling_fapl_free,         /* fapl_free             */
    0,                                 /* dxpl_size             */
    NULL,                              /* dxpl_copy             */
    NULL,                              /* dxpl_free             */
    H5FD__subfiling_open,              /* open                  */
    H5FD__subfiling_close,             /* close                 */
    H5FD__subfiling_cmp,               /* cmp                   */
    H5FD__subfiling_query,             /* query                 */
    NULL,                              /* get_type_map          */
    NULL,                              /* alloc                 */
    NULL,                              /* free                  */
    H5FD__subfiling_get_eoa,           /* get_eoa               */
    H5FD__subfiling_set_eoa,           /* set_eoa               */
    H5FD__subfiling_get_eof,           /* get_eof               */
    H5FD__subfiling_get_handle,        /* get_handle            */
    H5FD__subfiling_read,              /* read                  */
    H5FD__subfiling_write,             /* write                 */
    H5FD__subfiling_read_vector,       /* read_vector           */
    H5FD__subfiling_write_vector,      /* write_vector          */
    NULL,                              /* read_selection        */
    NULL,                              /* write_selection       */
    NULL,                              /* flush                 */
    H5FD__subfiling_truncate,          /* truncate              */
    NULL /* H5FD__subfiling_lock */,   /* lock                  */
    NULL /* H5FD__subfiling_unlock */, /* unlock                */
    H5FD__subfiling_del,               /* del                   */
    H5FD__subfiling_ctl,               /* ctl                   */
    H5FD_FLMAP_DICHOTOMY               /* fl_map                */
};

/* Declare a free list to manage the H5FD_subfiling_t struct */
H5FL_DEFINE_STATIC(H5FD_subfiling_t);

/*
 * If this VFD initialized MPI, this routine will be registered
 * as an atexit handler in order to finalize MPI before the
 * application exits.
 */
void
H5FD__subfiling_mpi_finalize(void)
{
    H5close();
    MPI_Finalize();
}

/*-------------------------------------------------------------------------
 * Function:    H5FD_subfiling_init
 *
 * Purpose:     Initialize this driver by registering the driver with the
 *              library.
 *
 * Return:      Success:    The driver ID for the subfiling driver
 *              Failure:    H5I_INVALID_HID
 *
 *-------------------------------------------------------------------------
 */
hid_t
H5FD_subfiling_init(void)
{
    hid_t ret_value = H5I_INVALID_HID; /* Return value */

    /* Register the Subfiling VFD, if it isn't already registered */
    if (H5I_VFL != H5I_get_type(H5FD_SUBFILING_g)) {
        int mpi_initialized = 0;
        int provided        = 0;
        int mpi_code;

        if ((H5FD_SUBFILING_g = H5FD_register(&H5FD_subfiling_g, sizeof(H5FD_class_t), false)) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_ID, H5E_CANTREGISTER, H5I_INVALID_HID,
                                    "can't register subfiling VFD");

        /* Initialize error reporting */
        if ((H5subfiling_err_stack_g = H5Ecreate_stack()) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID, "can't create HDF5 error stack");
        if ((H5subfiling_err_class_g = H5Eregister_class(H5SUBFILING_ERR_CLS_NAME, H5SUBFILING_ERR_LIB_NAME,
                                                         H5SUBFILING_ERR_VER)) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID,
                                    "can't register error class with HDF5 error API");

        /* Initialize MPI if not already initialized */
        if (MPI_SUCCESS != (mpi_code = MPI_Initialized(&mpi_initialized)))
            H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Initialized failed", mpi_code);
        if (mpi_initialized) {
            /* If MPI is initialized, validate that it was initialized with MPI_THREAD_MULTIPLE */
            if (MPI_SUCCESS != (mpi_code = MPI_Query_thread(&provided)))
                H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Query_thread failed", mpi_code);
            if (provided != MPI_THREAD_MULTIPLE)
                H5_SUBFILING_GOTO_ERROR(
                    H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID,
                    "Subfiling VFD requires the use of MPI_Init_thread with MPI_THREAD_MULTIPLE");
        }
        else {
            int required = MPI_THREAD_MULTIPLE;

            if (MPI_SUCCESS != (mpi_code = MPI_Init_thread(NULL, NULL, required, &provided)))
                H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Init_thread failed", mpi_code);

            H5FD_mpi_self_initialized = true;

            if (provided != required)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID,
                                        "MPI doesn't support MPI_Init_thread with MPI_THREAD_MULTIPLE");

            if (atexit(H5FD__subfiling_mpi_finalize) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID,
                                        "can't register atexit handler for MPI_Finalize");
        }

        /*
         * Create the MPI Datatype that will be used
         * for sending/receiving RPC messages
         */
        HDcompile_assert(sizeof(((sf_work_request_t *)NULL)->header) == 3 * sizeof(int64_t));
        if (H5_subfiling_rpc_msg_type == MPI_DATATYPE_NULL) {
            if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(3, MPI_INT64_T, &H5_subfiling_rpc_msg_type)))
                H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_contiguous failed", mpi_code);
            if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&H5_subfiling_rpc_msg_type)))
                H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_commit failed", mpi_code);
        }
    }

    /* Set return value */
    ret_value = H5FD_SUBFILING_g;

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD_subfiling_init() */

/*---------------------------------------------------------------------------
 * Function:    H5FD__subfiling_term
 *
 * Purpose:     Shut down the VFD
 *
 * Returns:     SUCCEED (Can't fail)
 *
 *---------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_term(void)
{
    herr_t ret_value = SUCCEED;

    if (H5FD_SUBFILING_g >= 0) {
        int mpi_finalized;
        int mpi_code;

        /*
         * Retrieve status of whether MPI has already been terminated.
         * This can happen if an HDF5 ID is left unclosed and HDF5
         * shuts down after MPI_Finalize() is called in an application.
         */
        if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
            H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);

        /* Free RPC message MPI Datatype */
        if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL) {
            if (!mpi_finalized) {
                if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
                    H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
            }
#ifdef H5FD_SUBFILING_DEBUG
            else
                printf("** WARNING **: HDF5 is terminating the Subfiling VFD after MPI_Finalize() was "
                       "called - an HDF5 ID was probably left unclosed\n");
#endif
        }

        /* Clean up resources */
        if (H5_subfiling_terminate() < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
                                    "can't cleanup internal subfiling resources");

        /* Unregister from HDF5 error API */
        if (H5subfiling_err_class_g >= 0) {
            if (H5Eunregister_class(H5subfiling_err_class_g) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CLOSEERROR, FAIL,
                                        "can't unregister error class from HDF5 error API");
        }
        if (H5subfiling_err_stack_g >= 0) {
            /* Print the current error stack before destroying it */
            PRINT_ERROR_STACK;

            /* Destroy the error stack */
            if (H5Eclose_stack(H5subfiling_err_stack_g) < 0) {
                H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CLOSEERROR, FAIL, "can't close HDF5 error stack");
                PRINT_ERROR_STACK;
            } /* end if */

            H5subfiling_err_stack_g = H5I_INVALID_HID;
            H5subfiling_err_class_g = H5I_INVALID_HID;
        }
    }

done:
    /* Reset VFL ID */
    H5FD_SUBFILING_g = H5I_INVALID_HID;

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_term() */

/*-------------------------------------------------------------------------
 * Function:    H5Pset_fapl_subfiling
 *
 * Purpose:     Modify the file access property list to use the
 *              H5FD_SUBFILING driver defined in this source file.  All
 *              driver specific properties are passed in as a pointer to
 *              a suitably initialized instance of H5FD_subfiling_config_t.
 *              If NULL is passed for the H5FD_subfiling_config_t
 *              structure, a default structure will be used instead.
 *
 * Return:      SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t *vfd_config)
{
    H5FD_subfiling_config_t *subfiling_conf = NULL;
    H5P_genplist_t          *plist          = NULL;
    H5P_genplist_t          *ioc_plist      = NULL;
    MPI_Comm                 comm           = MPI_COMM_NULL;
    MPI_Info                 info           = MPI_INFO_NULL;
    herr_t                   ret_value      = SUCCEED;

    /*NO TRACE*/

    /* Ensure Subfiling (and therefore MPI) is initialized before doing anything */
    if (H5FD_subfiling_init() < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "can't initialize subfiling VFD");

    if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");

    if (vfd_config == NULL) {
        if (NULL == (subfiling_conf = calloc(1, sizeof(*subfiling_conf))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate subfiling VFD configuration");
        subfiling_conf->ioc_fapl_id = H5I_INVALID_HID;

        /* Get subfiling VFD defaults */
        if (H5FD__subfiling_get_default_config(fapl_id, subfiling_conf) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,
                                    "can't get default subfiling VFD configuration");

        vfd_config = subfiling_conf;
    }

    /* Check if any MPI parameters were set on the FAPL */
    if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI communicator from plist");
    if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI info from plist");
    if (comm == MPI_COMM_NULL)
        comm = MPI_COMM_WORLD;

    /* Set MPI parameters on IOC FAPL */
    if (NULL == (ioc_plist = H5P_object_verify(vfd_config->ioc_fapl_id, H5P_FILE_ACCESS)))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");
    if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator on plist");
    if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info on plist");

    if (H5FD__subfiling_validate_config(vfd_config) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling VFD configuration");

    /* Set Subfiling configuration on IOC FAPL */
    if (H5_subfiling_set_config_prop(ioc_plist, &vfd_config->shared_cfg) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,
                                "can't set subfiling configuration on IOC FAPL");

    ret_value = H5P_set_driver(plist, H5FD_SUBFILING, vfd_config, NULL);

done:
    if (H5_mpi_comm_free(&comm) < 0)
        H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
    if (H5_mpi_info_free(&info) < 0)
        H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object");

    if (subfiling_conf) {
        if (subfiling_conf->ioc_fapl_id >= 0 && H5I_dec_ref(subfiling_conf->ioc_fapl_id) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC FAPL");
        free(subfiling_conf);
    }

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5Pset_fapl_subfiling() */

/*-------------------------------------------------------------------------
 * Function:    H5Pget_fapl_subfiling
 *
 * Purpose:     Returns information about the subfiling file access
 *              property list though the function arguments.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pget_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_config_t *config_out)
{
    const H5FD_subfiling_config_t *config_ptr         = NULL;
    H5P_genplist_t                *plist              = NULL;
    bool                           use_default_config = false;
    herr_t                         ret_value          = SUCCEED;

    /*NO TRACE*/

    if (config_out == NULL)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "config_out is NULL");

    if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");

    if (H5FD_SUBFILING != H5P_peek_driver(plist))
        use_default_config = true;
    else {
        config_ptr = H5P_peek_driver_info(plist);
        if (NULL == config_ptr)
            use_default_config = true;
    }

    if (use_default_config) {
        if (H5FD__subfiling_get_default_config(fapl_id, config_out) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
                                    "can't get default Subfiling VFD configuration");
    }
    else {
        /* Copy the subfiling fapl data out */
        H5MM_memcpy(config_out, config_ptr, sizeof(H5FD_subfiling_config_t));

        /* Copy the driver info value */
        if (H5FD__copy_plist(config_ptr->ioc_fapl_id, &(config_out->ioc_fapl_id)) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't copy IOC FAPL");
    }

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5Pget_fapl_subfiling() */

static herr_t
H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *config_out)
{
    MPI_Comm comm = MPI_COMM_NULL;
    MPI_Info info = MPI_INFO_NULL;
    char    *h5_require_ioc;
    herr_t   ret_value = SUCCEED;

    assert(config_out);

    memset(config_out, 0, sizeof(*config_out));

    config_out->magic       = H5FD_SUBFILING_FAPL_MAGIC;
    config_out->version     = H5FD_SUBFILING_CURR_FAPL_VERSION;
    config_out->ioc_fapl_id = H5I_INVALID_HID;
    config_out->require_ioc = true;

    config_out->shared_cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE;
    config_out->shared_cfg.stripe_size   = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
    config_out->shared_cfg.stripe_count  = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;

    if ((h5_require_ioc = getenv("H5_REQUIRE_IOC")) != NULL) {
        int value_check = atoi(h5_require_ioc);
        if (value_check == 0)
            config_out->require_ioc = false;
    }

    /* Check if any MPI parameters were set on the FAPL */
    if (H5Pget_mpi_params(fapl_id, &comm, &info) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI Comm/Info");
    if (comm == MPI_COMM_NULL) {
        comm = MPI_COMM_WORLD;

        /* Set MPI_COMM_WORLD on FAPL if no MPI parameters were set */
        if (H5Pset_mpi_params(fapl_id, comm, info) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI Comm/Info");
    }

    /* Create a default FAPL and choose an appropriate underlying driver */
    if ((config_out->ioc_fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTCREATE, FAIL, "can't create default FAPL");

    if (config_out->require_ioc) {
        if (H5Pset_mpi_params(config_out->ioc_fapl_id, comm, info) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't get MPI Comm/Info on IOC FAPL");

        if (H5Pset_fapl_ioc(config_out->ioc_fapl_id, NULL) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set IOC VFD on IOC FAPL");
    }
    else {
        if (H5Pset_fapl_sec2(config_out->ioc_fapl_id) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set Sec2 VFD on IOC FAPL");
    }

done:
    if (H5_mpi_comm_free(&comm) < 0)
        H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
    if (H5_mpi_info_free(&info) < 0)
        H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object");

    if (ret_value < 0) {
        if (config_out->ioc_fapl_id >= 0 && H5Pclose(config_out->ioc_fapl_id) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, FAIL, "can't close FAPL");
        config_out->ioc_fapl_id = H5I_INVALID_HID;
    }

    H5_SUBFILING_FUNC_LEAVE;
}

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_validate_config()
 *
 * Purpose:     Test to see if the supplied instance of
 *              H5FD_subfiling_config_t contains internally consistent data.
 *              Return SUCCEED if so, and FAIL otherwise.
 *
 *              Note the difference between internally consistent and
 *              correct.  As we will have to try to setup subfiling to
 *              determine whether the supplied data is correct,
 *              we will settle for internal consistency at this point
 *
 * Return:      SUCCEED if instance of H5FD_subfiling_config_t contains
 *              internally consistent data, FAIL otherwise.
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa)
{
    herr_t ret_value = SUCCEED;

    assert(fa != NULL);

    if (fa->version != H5FD_SUBFILING_CURR_FAPL_VERSION)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "unknown H5FD_subfiling_config_t version");

    if (fa->magic != H5FD_SUBFILING_FAPL_MAGIC)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid H5FD_subfiling_config_t magic value");

    if (fa->ioc_fapl_id < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC FAPL ID");

    if (!fa->require_ioc)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "Subfiling VFD currently always requires IOC VFD to be used");

    if (H5_subfiling_validate_config(&fa->shared_cfg) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration parameters");

done:
    H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__subfiling_validate_config() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_sb_size
 *
 * Purpose:     Returns the size of the subfiling configuration information
 *              to be stored in the superblock.
 *
 * Return:      Size of subfiling configuration information (never fails)
 *-------------------------------------------------------------------------
 */
static hsize_t
H5FD__subfiling_sb_size(H5FD_t *_file)
{
    subfiling_context_t *sf_context = NULL;
    H5FD_subfiling_t    *file       = (H5FD_subfiling_t *)_file;
    hsize_t              ret_value  = 0;

    assert(file);

    /* Configuration structure magic number */
    ret_value += sizeof(uint32_t);

    /* Configuration structure version number */
    ret_value += sizeof(uint32_t);

    /* "Require IOC" field */
    ret_value += sizeof(int32_t);

    /* Subfiling stripe size */
    ret_value += sizeof(int64_t);

    /* Subfiling stripe count (encoded as int64_t for future) */
    ret_value += sizeof(int64_t);

    /* Subfiling config file prefix string length */
    ret_value += sizeof(uint64_t);

    /*
     * Since this callback currently can't return any errors, we
     * will set the "fail to encode" flag on the file if we fail
     * to retrieve the context object here so we can check for
     * errors later.
     */
    if (NULL == (sf_context = H5_get_subfiling_object(file->context_id))) {
        file->fail_to_encode = true;
    }
    else {
        if (sf_context->config_file_prefix) {
            ret_value += strlen(sf_context->config_file_prefix) + 1;
        }
    }

    /* Add superblock information from IOC file if necessary */
    if (file->sf_file) {
        /* Encode the IOC's name into the subfiling information */
        ret_value += 9;

        ret_value += H5FD_sb_size(file->sf_file);
    }

    /*
     * Since the library doesn't currently properly check this,
     * set the "fail to encode" flag if the message size is
     * larger than the library's currently accepted max message
     * size so that we don't try to encode the message and overrun
     * a buffer.
     */
    if (ret_value > H5FD_SUBFILING_MAX_DRV_INFO_SIZE)
        file->fail_to_encode = true;

    H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__subfiling_sb_size() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_sb_encode
 *
 * Purpose:     Encodes the subfiling configuration information into the
 *              specified buffer.
 *
 * Return:      Non-negative on success/Negative on failure
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf)
{
    subfiling_context_t *sf_context = NULL;
    H5FD_subfiling_t    *file       = (H5FD_subfiling_t *)_file;
    uint8_t             *p          = (uint8_t *)buf;
    uint64_t             tmpu64;
    int64_t              tmp64;
    int32_t              tmp32;
    size_t               prefix_len = 0;
    herr_t               ret_value  = SUCCEED;

    /* Check if the "fail to encode flag" is set */
    if (file->fail_to_encode)
        H5_SUBFILING_GOTO_ERROR(
            H5E_VFL, H5E_CANTENCODE, FAIL,
            "can't encode subfiling driver info message - message was too large or internal error occurred");

    if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");

    /* Encode driver name */
    strncpy(name, "Subfilin", 9);
    name[8] = '\0';

    /* Encode configuration structure magic number */
    UINT32ENCODE(p, file->fa.magic);

    /* Encode configuration structure version number */
    UINT32ENCODE(p, file->fa.version);

    /* Encode "require IOC" field */
    tmp32 = (int32_t)file->fa.require_ioc;
    INT32ENCODE(p, tmp32);

    /* Encode subfiling stripe size */
    INT64ENCODE(p, sf_context->sf_stripe_size);

    /* Encode subfiling stripe count (number of subfiles) */
    tmp64 = sf_context->sf_num_subfiles;
    INT64ENCODE(p, tmp64);

    /* Encode config file prefix string length */
    if (sf_context->config_file_prefix) {
        prefix_len = strlen(sf_context->config_file_prefix) + 1;
        H5_CHECKED_ASSIGN(tmpu64, uint64_t, prefix_len, size_t);
    }
    else
        tmpu64 = 0;
    UINT64ENCODE(p, tmpu64);

    /* Encode config file prefix string */
    if (sf_context->config_file_prefix) {
        H5MM_memcpy(p, sf_context->config_file_prefix, prefix_len);
        p += prefix_len;
    }

    /* Encode IOC VFD configuration information if necessary */
    if (file->sf_file) {
        char ioc_name[9];

        memset(ioc_name, 0, sizeof(ioc_name));

        if (H5FD_sb_encode(file->sf_file, ioc_name, p + 9) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL,
                                    "unable to encode IOC VFD's superblock information");

        /* Copy the IOC VFD's name into our buffer */
        H5MM_memcpy(p, ioc_name, 9);
    }

done:
    H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__subfiling_sb_encode() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_sb_decode
 *
 * Purpose:     Decodes the subfiling configuration information from the
 *              specified buffer.
 *
 * Return:      Non-negative on success/Negative on failure
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf)
{
    subfiling_context_t *sf_context = NULL;
    H5FD_subfiling_t    *file       = (H5FD_subfiling_t *)_file;
    const uint8_t       *p          = (const uint8_t *)buf;
    uint64_t             tmpu64;
    int64_t              tmp64;
    int32_t              tmp32;
    herr_t               ret_value = SUCCEED;

    /* Check if we previously failed to encode the info */
    if (file->fail_to_encode)
        H5_SUBFILING_GOTO_ERROR(
            H5E_VFL, H5E_CANTDECODE, FAIL,
            "can't decode subfiling driver info message - message wasn't encoded (or encoded improperly)");

    if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");

    if (strncmp(name, "Subfilin", 9))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid driver name in superblock");

    /* Decode configuration structure magic number */
    UINT32DECODE(p, file->fa.magic);

    /* Decode configuration structure version number */
    UINT32DECODE(p, file->fa.version);

    /* Decode "require IOC" field */
    INT32DECODE(p, tmp32);
    file->fa.require_ioc = (bool)tmp32;

    /* Decode subfiling stripe size */
    INT64DECODE(p, file->fa.shared_cfg.stripe_size);

    /* Decode subfiling stripe count */
    INT64DECODE(p, tmp64);
    H5_CHECK_OVERFLOW(tmp64, int64_t, int32_t);
    file->fa.shared_cfg.stripe_count = (int32_t)tmp64;

    /* Decode config file prefix string length */
    UINT64DECODE(p, tmpu64);

    /* Decode config file prefix string */
    if (tmpu64 > 0) {
        if (!sf_context->config_file_prefix) {
            if (NULL == (sf_context->config_file_prefix = malloc(tmpu64)))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate space for config file prefix string");

            H5MM_memcpy(sf_context->config_file_prefix, p, tmpu64);

            /* Just in case... */
            sf_context->config_file_prefix[tmpu64 - 1] = '\0';
        }

        p += tmpu64;
    }

    if (file->sf_file) {
        char ioc_name[9];

        H5MM_memcpy(ioc_name, p, 9);
        p += 9;

        if (H5FD_sb_load(file->sf_file, ioc_name, p) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL,
                                    "unable to decode IOC VFD's superblock information");
    }

    /* Validate the decoded configuration */
    if (H5FD__subfiling_validate_config(&file->fa) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
                                "decoded subfiling configuration info is invalid");

    if (file->fa.shared_cfg.stripe_size != sf_context->sf_stripe_size)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
                                "specified subfiling stripe size (%" PRId64
                                ") doesn't match value stored in file (%" PRId64 ")",
                                sf_context->sf_stripe_size, file->fa.shared_cfg.stripe_size);

    if (file->fa.shared_cfg.stripe_count != sf_context->sf_num_subfiles)
        H5_SUBFILING_GOTO_ERROR(
            H5E_VFL, H5E_BADVALUE, FAIL,
            "specified subfiling stripe count (%d) doesn't match value stored in file (%" PRId32 ")",
            sf_context->sf_num_subfiles, file->fa.shared_cfg.stripe_count);

done:
    H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__subfiling_sb_decode() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_fapl_get
 *
 * Purpose:     Gets a file access property list which could be used to
 *              create an identical file.
 *
 * Return:      Success:        Ptr to new file access property list value.
 *
 *              Failure:        NULL
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD__subfiling_fapl_get(H5FD_t *_file)
{
    H5FD_subfiling_t        *file      = (H5FD_subfiling_t *)_file;
    H5FD_subfiling_config_t *fa        = NULL;
    void                    *ret_value = NULL;

    fa = (H5FD_subfiling_config_t *)H5MM_calloc(sizeof(H5FD_subfiling_config_t));

    if (fa == NULL) {
        H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
    }

    /* Copy the fields of the structure */
    H5MM_memcpy(fa, &(file->fa), sizeof(H5FD_subfiling_config_t));

    /* Copy the driver info value */
    if (H5FD__copy_plist(file->fa.ioc_fapl_id, &(fa->ioc_fapl_id)) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy IOC FAPL");

    /* Set return value */
    ret_value = fa;

done:
    if (ret_value == NULL) {

        if (fa != NULL) {
            H5MM_xfree(fa);
        }
    }

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_fapl_get() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__copy_plist
 *
 * Purpose:     Sanity-wrapped H5P_copy_plist() for each channel.
 *              Utility function for operation in multiple locations.
 *
 * Return:      0 on success, -1 on error.
 *-------------------------------------------------------------------------
 */
/* TODO: no need for this function */
static int
H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr)
{
    int             ret_value = 0;
    H5P_genplist_t *plist_ptr = NULL;

    assert(id_out_ptr != NULL);

    if (false == H5P_isa_class(fapl_id, H5P_FILE_ACCESS))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "not a file access property list");

    plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id);
    if (NULL == plist_ptr)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "unable to get property list");

    *id_out_ptr = H5P_copy_plist(plist_ptr, false);
    if (H5I_INVALID_HID == *id_out_ptr)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADTYPE, -1, "unable to copy file access property list");

done:
    H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__copy_plist() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_fapl_copy
 *
 * Purpose:     Copies the subfiling-specific file access properties.
 *
 * Return:      Success:        Ptr to a new property list
 *
 *              Failure:        NULL
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD__subfiling_fapl_copy(const void *_old_fa)
{
    const H5FD_subfiling_config_t *old_fa    = (const H5FD_subfiling_config_t *)_old_fa;
    H5FD_subfiling_config_t       *new_fa    = NULL;
    void                          *ret_value = NULL;

    new_fa = (H5FD_subfiling_config_t *)H5MM_malloc(sizeof(H5FD_subfiling_config_t));
    if (new_fa == NULL) {
        H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
    }

    H5MM_memcpy(new_fa, old_fa, sizeof(H5FD_subfiling_config_t));

    if (H5FD__copy_plist(old_fa->ioc_fapl_id, &(new_fa->ioc_fapl_id)) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy the IOC FAPL");

    ret_value = new_fa;

done:
    if (ret_value == NULL) {

        if (new_fa != NULL) {
            H5MM_xfree(new_fa);
        }
    }

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_fapl_copy() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_fapl_free
 *
 * Purpose:     Frees the subfiling-specific file access properties.
 *
 * Return:      SUCCEED (cannot fail)
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_fapl_free(void *_fa)
{
    H5FD_subfiling_config_t *fa        = (H5FD_subfiling_config_t *)_fa;
    herr_t                   ret_value = SUCCEED;

    assert(fa != NULL); /* sanity check */

    if (fa->ioc_fapl_id >= 0 && H5I_dec_ref(fa->ioc_fapl_id) < 0)
        H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC FAPL");
    fa->ioc_fapl_id = H5I_INVALID_HID;

    H5MM_xfree(fa);

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_fapl_free() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_open
 *
 * Purpose:     Create and/or opens a file as an HDF5 file.
 *
 * Return:      Success:    A pointer to a new file data structure. The
 *                          public fields will be initialized by the
 *                          caller, which is always H5FD_open().
 *              Failure:    NULL
 *
 *-------------------------------------------------------------------------
 */
static H5FD_t *
H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
{
    H5FD_subfiling_t              *file_ptr   = NULL; /* Subfiling VFD info */
    const H5FD_subfiling_config_t *config_ptr = NULL; /* Driver-specific property list */
    H5FD_subfiling_config_t        default_config;
    H5FD_class_t                  *driver    = NULL; /* VFD for file */
    H5P_genplist_t                *plist_ptr = NULL;
    H5FD_driver_prop_t             driver_prop; /* Property for driver ID & info */
    bool                           bcasted_eof = false;
    int64_t                        sf_eof      = -1;
    int                            mpi_code; /* MPI return code */
    H5FD_t                        *ret_value = NULL;

    /* Check arguments */
    if (!name || !*name)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name");
    if (0 == maxaddr || HADDR_UNDEF == maxaddr)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr");
    if (ADDR_OVERFLOW(maxaddr))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr");

    if (NULL == (file_ptr = (H5FD_subfiling_t *)H5FL_CALLOC(H5FD_subfiling_t)))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct");
    file_ptr->comm           = MPI_COMM_NULL;
    file_ptr->info           = MPI_INFO_NULL;
    file_ptr->file_id        = UINT64_MAX;
    file_ptr->context_id     = -1;
    file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;
    file_ptr->ext_comm       = MPI_COMM_NULL;
    file_ptr->fail_to_encode = false;

    /* Get the driver-specific file access properties */
    if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id)))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list");

    if (H5FD_mpi_self_initialized) {
        file_ptr->comm = MPI_COMM_WORLD;
        file_ptr->info = MPI_INFO_NULL;
    }
    else {
        /* Get the MPI communicator and info object from the property list */
        if (H5P_get(plist_ptr, H5F_ACS_MPI_PARAMS_COMM_NAME, &file_ptr->comm) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator");
        if (H5P_get(plist_ptr, H5F_ACS_MPI_PARAMS_INFO_NAME, &file_ptr->info) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object");

        if (file_ptr->comm == MPI_COMM_NULL)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "invalid or unset MPI communicator in FAPL");
    }

    /* Get the MPI rank of this process and the total number of processes */
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_ptr->comm, &file_ptr->mpi_rank)))
        H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code);
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_ptr->comm, &file_ptr->mpi_size)))
        H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code);

    /* Work around an HDF5 metadata cache bug with distributed metadata writes when MPI size == 1 */
    if (file_ptr->mpi_size == 1) {
        H5AC_cache_config_t mdc_config;

        /* Get the current initial metadata cache resize configuration */
        if (H5P_get(plist_ptr, H5F_ACS_META_CACHE_INIT_CONFIG_NAME, &mdc_config) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get metadata cache initial config");
        mdc_config.metadata_write_strategy = H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY;
        if (H5P_set(plist_ptr, H5F_ACS_META_CACHE_INIT_CONFIG_NAME, &mdc_config) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, NULL, "can't set metadata cache initial config");
    }

    config_ptr = H5P_peek_driver_info(plist_ptr);
    if (!config_ptr || (H5P_FILE_ACCESS_DEFAULT == fapl_id)) {
        if (H5FD__subfiling_get_default_config(fapl_id, &default_config) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL,
                                    "can't get default subfiling VFD configuration");
        config_ptr = &default_config;
    }

    H5MM_memcpy(&file_ptr->fa, config_ptr, sizeof(H5FD_subfiling_config_t));
    if (H5FD__copy_plist(config_ptr->ioc_fapl_id, &(file_ptr->fa.ioc_fapl_id)) < 0) {
        file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy FAPL");
    }

    /* Check the "native" driver (IOC/sec2/etc.) */
    if (NULL == (plist_ptr = H5I_object(file_ptr->fa.ioc_fapl_id)))
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "invalid IOC FAPL");

    if (H5P_peek(plist_ptr, H5F_ACS_FILE_DRV_NAME, &driver_prop) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get driver ID & info");
    if (NULL == (driver = (H5FD_class_t *)H5I_object(driver_prop.driver_id)))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL,
                                "invalid driver ID in file access property list");

    if (driver->value != H5_VFD_IOC)
        H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
                                "unable to open file '%s' - only IOC VFD is currently supported for subfiles",
                                name);

    /* Fully resolve the given filepath and get its dirname */
    if (H5_resolve_pathname(name, file_ptr->comm, &file_ptr->file_path) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve filepath");
    if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get filepath dirname");

    /*
     * Create/open the HDF5 stub file and get its inode value for
     * the internal mapping from file inode to subfiling context.
     */
    if (H5_open_subfiling_stub_file(file_ptr->file_path, flags, file_ptr->comm, &file_ptr->stub_file,
                                    &file_ptr->file_id) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "can't open HDF5 stub file");

    /* Set stub file ID on IOC fapl so it can reuse on open */
    if (H5_subfiling_set_file_id_prop(plist_ptr, file_ptr->file_id) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, NULL, "can't set stub file ID on FAPL");

    /* Open the HDF5 file's subfiles */
    if (NULL == (file_ptr->sf_file = H5FD_open(name, flags, file_ptr->fa.ioc_fapl_id, HADDR_UNDEF)))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file");

    if (driver->value == H5_VFD_IOC) {
        /* Get a copy of the context ID for later use */
        file_ptr->context_id     = H5_subfile_fid_to_context(file_ptr->file_id);
        file_ptr->fa.require_ioc = true;
    }
    else if (driver->value == H5_VFD_SEC2) {
        int ioc_flags;

        /* Translate the HDF5 file open flags into standard POSIX open flags */
        ioc_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;
        if (H5F_ACC_TRUNC & flags)
            ioc_flags |= O_TRUNC;
        if (H5F_ACC_CREAT & flags)
            ioc_flags |= O_CREAT;
        if (H5F_ACC_EXCL & flags)
            ioc_flags |= O_EXCL;

        /*
         * Open the subfiles for this HDF5 file. A subfiling
         * context ID will be returned, which is used for
         * further interactions with this file's subfiles.
         */
        if (H5_open_subfiles(file_ptr->file_path, file_ptr->file_id, &file_ptr->fa.shared_cfg, ioc_flags,
                             file_ptr->comm, &file_ptr->context_id) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiling files = %s\n",
                                    name);
    }

    if (file_ptr->mpi_rank == 0) {
        if (H5FD__subfiling__get_real_eof(file_ptr->context_id, &sf_eof) < 0)
            sf_eof = -1;
    }

    if (file_ptr->mpi_size > 1) {
        if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
            H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Bcast", mpi_code);
    }

    bcasted_eof = true;

    if (sf_eof < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "lead MPI process failed to get file EOF");

    file_ptr->eof       = (haddr_t)sf_eof;
    file_ptr->local_eof = file_ptr->eof;

    ret_value = (H5FD_t *)file_ptr;

done:
    if (config_ptr == &default_config)
        if (H5I_dec_ref(config_ptr->ioc_fapl_id) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, NULL, "can't close IOC FAPL");

    if (NULL == ret_value) {
        if (file_ptr) {
            /* Participate in possible MPI collectives on failure */
            if (file_ptr->comm != MPI_COMM_NULL) {
                if (!bcasted_eof) {
                    sf_eof = -1;

                    if (file_ptr->mpi_size > 1) {
                        if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
                            H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Bcast failed", mpi_code);
                    }
                }
            }

            if (H5FD__subfiling_close_int(file_ptr) < 0)
                H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CLOSEERROR, NULL, "couldn't close file");
        }
    }

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_open() */

static herr_t
H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
{
    int    mpi_finalized;
    int    mpi_code;
    herr_t ret_value = SUCCEED;

    assert(file_ptr);

    if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
        H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);

    if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile");
    if (file_ptr->stub_file && H5FD_close(file_ptr->stub_file) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 stub file");

    /* if set, close the copy of the plist for the underlying VFD. */
    if ((file_ptr->fa.ioc_fapl_id >= 0) && (H5I_dec_ref(file_ptr->fa.ioc_fapl_id) < 0))
        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close IOC FAPL");
    file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;

    if (!mpi_finalized) {
        if (H5_mpi_comm_free(&file_ptr->comm) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
        if (H5_mpi_info_free(&file_ptr->info) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");

        if (H5_mpi_comm_free(&file_ptr->ext_comm) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
    }

    file_ptr->fail_to_encode = false;

done:
    free(file_ptr->file_path);
    file_ptr->file_path = NULL;

    H5MM_free(file_ptr->file_dir);
    file_ptr->file_dir = NULL;

    /* Release the file info */
    file_ptr = H5FL_FREE(H5FD_subfiling_t, file_ptr);

    H5_SUBFILING_FUNC_LEAVE;
}

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_close
 *
 * Purpose:     Closes an HDF5 file.
 *
 * Return:      Success:    SUCCEED
 *              Failure:    FAIL, file not closed.
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_close(H5FD_t *_file)
{
    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
    herr_t            ret_value = SUCCEED;

    if (H5FD__subfiling_close_int(file_ptr) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to close file");

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_close() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_cmp
 *
 * Purpose:     Compares two files belonging to this driver using an
 *              arbitrary (but consistent) ordering.
 *
 * Return:      Success:    A value like strcmp()
 *              Failure:    never fails (arguments were checked by the
 *                          caller).
 *
 *-------------------------------------------------------------------------
 */
static int
H5FD__subfiling_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
{
    const H5FD_subfiling_t *f1        = (const H5FD_subfiling_t *)_f1;
    const H5FD_subfiling_t *f2        = (const H5FD_subfiling_t *)_f2;
    int                     ret_value = 0;

    assert(f1);
    assert(f2);

    ret_value = H5FD_cmp(f1->sf_file, f2->sf_file);

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_cmp() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_query
 *
 * Purpose:     Set the flags that this VFL driver is capable of supporting.
 *              (listed in H5FDpublic.h)
 *
 *              For now, duplicate the flags used for the MPIO VFD.
 *              Revisit this when we have a version of the subfiling VFD
 *              that is usable in serial builds.
 *
 * Return:      SUCCEED (Can't fail)
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
{
    herr_t ret_value = SUCCEED;

    /* Set the VFL feature flags that this driver supports */
    if (flags) {
        *flags = 0;
        *flags |= H5FD_FEAT_AGGREGATE_METADATA;  /* OK to aggregate metadata allocations  */
        *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
        *flags |= H5FD_FEAT_HAS_MPI;             /* This driver uses MPI */
    }

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_query() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_get_eoa
 *
 * Purpose:     Gets the end-of-address marker for the file. The EOA marker
 *              is the first address past the last byte allocated in the
 *              format address space.
 *
 * Return:      The end-of-address marker.
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
    const H5FD_subfiling_t *file      = (const H5FD_subfiling_t *)_file;
    haddr_t                 ret_value = HADDR_UNDEF;

    ret_value = file->eoa;

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_get_eoa() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_set_eoa
 *
 * Purpose:     Set the end-of-address marker for the file. This function is
 *              called shortly after an existing HDF5 file is opened in order
 *              to tell the driver where the end of the HDF5 data is located.
 *
 * Return:      SUCCEED (Can't fail)
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr)
{
    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
    herr_t            ret_value = SUCCEED;

    file_ptr->eoa = addr;

    /* Set EOA for HDF5 stub file */
    if (file_ptr->mpi_rank == 0) {
        if (H5FD_set_eoa(file_ptr->stub_file, type, addr) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set HDF5 stub file EOA");
    }

    ret_value = H5FD_set_eoa(file_ptr->sf_file, type, addr);

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_set_eoa() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_get_eof
 *
 * Purpose:     Returns the end-of-file marker from the filesystem
 *              perspective.
 *
 * Return:      End of file address, the first address past the end of the
 *              "file", either the filesystem file or the HDF5 file.
 *
 *              NOTE: This VFD mimics the MPI I/O VFD and so does not try
 *              to keep the EOF updated. The EOF is mostly just needed
 *              right after the file is opened so the library can determine
 *              if the file is empty, truncated or okay.
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
    const H5FD_subfiling_t *file      = (const H5FD_subfiling_t *)_file;
    haddr_t                 ret_value = HADDR_UNDEF;

    ret_value = file->eof;

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_get_eof() */

/*-------------------------------------------------------------------------
 * Function:       H5FD__subfiling_get_handle
 *
 * Purpose:        Returns the file handle of subfiling file driver.
 *
 * Returns:        SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle)
{
    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file;
    herr_t            ret_value = SUCCEED;

    if (!file_handle)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid");

    H5FD_get_vfd_handle(file->sf_file, file->fa.ioc_fapl_id, file_handle);

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_get_handle() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_read
 *
 * Purpose:     Reads SIZE bytes of data from FILE beginning at address ADDR
 *              into buffer BUF according to data transfer properties in
 *              DXPL_ID.
 *
 * Return:      Success:    SUCCEED. Result is stored in caller-supplied
 *                          buffer BUF.
 *              Failure:    FAIL, Contents of buffer BUF are undefined.
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
                     void *buf /*out*/)
{
    subfiling_context_t *sf_context         = NULL;
    H5FD_subfiling_t    *file_ptr           = (H5FD_subfiling_t *)_file;
    H5FD_mem_t          *io_types           = NULL;
    haddr_t             *io_addrs           = NULL;
    size_t              *io_sizes           = NULL;
    void               **io_bufs            = NULL;
    int64_t             *source_data_offset = NULL;
    int64_t             *sf_data_size       = NULL;
    int64_t             *sf_offset          = NULL;
    bool                 rank0_bcast        = false;
    int                  num_subfiles;
    herr_t               ret_value = SUCCEED;

    assert(file_ptr && file_ptr->pub.cls);
    assert(buf);

    /* Check for overflow conditions */
    if (!H5_addr_defined(addr))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %" PRIuHADDR, addr);
    if (REGION_OVERFLOW(addr, size))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
                                "addr overflow, addr = %" PRIuHADDR ", size = %zu", addr, size);

    /* Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE) */
    {
        H5FD_mpio_xfer_t xfer_mode;

        if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL,
                                    "can't determine I/O collectivity setting");

        if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
            MPI_Datatype btype, ftype;

            if (H5CX_get_mpi_coll_datatypes(&btype, &ftype) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes");
            if (MPI_BYTE != btype || MPI_BYTE != ftype)
                H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_UNSUPPORTED, FAIL,
                                        "collective I/O is currently unsupported");
        }

        /* Determine whether a rank 0 bcast approach has been requested */
        rank0_bcast = H5CX_get_mpio_rank0_bcast();

        /*
         * If we reached here, we're still doing independent I/O regardless
         * of collectivity setting, so set that.
         */
        H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT);
    }

    /*
     * Retrieve the subfiling context object and the number
     * of subfiles.
     *
     * Given the current I/O and the I/O concentrator info,
     * we can determine some I/O transaction parameters.
     * In particular, for large I/O operations, each IOC
     * may require multiple I/Os to fulfill the user I/O
     * request. The block size and number of IOCs are used
     * to size the vectors that will be used to invoke the
     * underlying I/O operations.
     */
    sf_context = (subfiling_context_t *)H5_get_subfiling_object(file_ptr->context_id);
    assert(sf_context);
    assert(sf_context->topology);

    num_subfiles = sf_context->sf_num_subfiles;

    if (num_subfiles <= 0) {
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)",
                                num_subfiles);
    }
    else if (num_subfiles == 1) {
        /***************************************
         * No striping - just a single subfile *
         ***************************************/

        /* Make vector read call to subfile */
        if (H5FD_read_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed");
    }
    else {
        int64_t max_io_req_per_subfile;
        int64_t file_offset;
        int64_t block_size;
        size_t  max_depth;
        herr_t  status;
        int     num_subfiles_used = 0;
        int     first_subfile_idx = -1;

        /*************************************
         * Striping across multiple subfiles *
         *************************************/

        block_size = sf_context->sf_blocksize_per_stripe;
        max_depth  = (size / (size_t)block_size) + 2;

        /*
         * Given the number of subfiles, allocate vectors (one per subfile)
         * to contain the translation of the I/O request into a collection of
         * I/O requests.
         */
        if (NULL ==
            (source_data_offset = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate source data offset I/O vector");
        if (NULL == (sf_data_size = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate subfile data size I/O vector");
        if (NULL == (sf_offset = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate subfile offset I/O vector");

        H5_CHECKED_ASSIGN(file_offset, int64_t, addr, haddr_t);

        /*
         * Get the potential set of IOC transactions; e.g., data sizes,
         * offsets and datatypes.
         */
        status = init_indep_io(sf_context,         /* IN: Context used to look up config info */
                               file_offset,        /* IN: Starting file offset */
                               size,               /* IN: I/O size */
                               1,                  /* IN: Data extent of the 'type' assumes byte */
                               max_depth,          /* IN: Maximum stripe depth */
                               source_data_offset, /* OUT: Memory offset */
                               sf_offset,          /* OUT: File offset */
                               sf_data_size,       /* OUT: Length of this contiguous block */
                               &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */
                               &num_subfiles_used, /* OUT: Number of actual subfiles used */
                               &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */

        if (status < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions");

        if (max_io_req_per_subfile > 0) {
            uint32_t vector_len;

            H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int);

            /* Allocate I/O vectors */
            if (NULL == (io_types = malloc(vector_len * sizeof(*io_types))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O types vector");
            if (NULL == (io_addrs = malloc(vector_len * sizeof(*io_addrs))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O addresses vector");
            if (NULL == (io_sizes = malloc(vector_len * sizeof(*io_sizes))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O sizes vector");
            if (NULL == (io_bufs = malloc(vector_len * sizeof(*io_bufs))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O buffers vector");

            for (int64_t i = 0; i < max_io_req_per_subfile; i++) {
                uint32_t final_vec_len    = vector_len;
                int      next_subfile_idx = first_subfile_idx;

                /* Fill in I/O types, offsets, sizes and buffers vectors */
                for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) {
                    size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i;

                    io_types[vec_idx] = type;
                    H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t);
                    H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t);
                    io_bufs[vec_idx] = ((char *)buf + source_data_offset[idx]);

                    next_subfile_idx = (next_subfile_idx + 1) % num_subfiles;

                    /* Skip 0-sized I/Os */
                    if (io_sizes[vec_idx] == 0) {
                        final_vec_len--;
                        continue;
                    }

                    vec_idx++;
                }

                if (!rank0_bcast || (file_ptr->mpi_rank == 0)) {
                    /* Make vector read call to subfile */
                    if (H5FD_read_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes,
                                         io_bufs) < 0)
                        H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed");
                }
            }

            if (rank0_bcast && (file_ptr->mpi_size > 1)) {
                H5_CHECK_OVERFLOW(size, size_t, int);
                if (MPI_SUCCESS != MPI_Bcast(buf, (int)size, MPI_BYTE, 0, file_ptr->comm))
                    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "can't broadcast data from rank 0");
            }
        }
    }

    /* Point to the end of the current I/O */
    addr += (haddr_t)size;

    /* Update current file position and EOF */
    file_ptr->pos = addr;
    file_ptr->op  = OP_READ;

done:
    free(io_bufs);
    free(io_sizes);
    free(io_addrs);
    free(io_types);
    free(sf_offset);
    free(sf_data_size);
    free(source_data_offset);

    if (ret_value < 0) {
        /* Reset last file I/O information */
        file_ptr->pos = HADDR_UNDEF;
        file_ptr->op  = OP_UNKNOWN;
    } /* end if */

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_read() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_write
 *
 * Purpose:     Writes SIZE bytes of data to FILE beginning at address ADDR
 *              from buffer BUF according to data transfer properties in
 *              DXPL_ID.
 *
 * Return:      SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
                      const void *buf /*in*/)
{
    subfiling_context_t *sf_context         = NULL;
    H5FD_subfiling_t    *file_ptr           = (H5FD_subfiling_t *)_file;
    const void         **io_bufs            = NULL;
    H5FD_mem_t          *io_types           = NULL;
    haddr_t             *io_addrs           = NULL;
    size_t              *io_sizes           = NULL;
    int64_t             *source_data_offset = NULL;
    int64_t             *sf_data_size       = NULL;
    int64_t             *sf_offset          = NULL;
    int                  num_subfiles;
    herr_t               ret_value = SUCCEED;

    assert(file_ptr && file_ptr->pub.cls);
    assert(buf);

    /* Check for overflow conditions */
    if (!H5_addr_defined(addr))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %" PRIuHADDR, addr);
    if (REGION_OVERFLOW(addr, size))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
                                "addr overflow, addr = %" PRIuHADDR ", size = %zu", addr, size);

    /* Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE) */
    {
        H5FD_mpio_xfer_t xfer_mode;

        if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL,
                                    "can't determine I/O collectivity setting");

        if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
            MPI_Datatype btype, ftype;

            if (H5CX_get_mpi_coll_datatypes(&btype, &ftype) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes");
            if (MPI_BYTE != btype || MPI_BYTE != ftype)
                H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_UNSUPPORTED, FAIL,
                                        "collective I/O is currently unsupported");
        }

        /*
         * If we reached here, we're still doing independent I/O regardless
         * of collectivity setting, so set that.
         */
        H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT);
    }

    /*
     * Retrieve the subfiling context object and the number
     * of subfiles.
     *
     * Given the current I/O and the I/O concentrator info,
     * we can determine some I/O transaction parameters.
     * In particular, for large I/O operations, each IOC
     * may require multiple I/Os to fulfill the user I/O
     * request. The block size and number of IOCs are used
     * to size the vectors that will be used to invoke the
     * underlying I/O operations.
     */
    sf_context = (subfiling_context_t *)H5_get_subfiling_object(file_ptr->context_id);
    assert(sf_context);
    assert(sf_context->topology);

    num_subfiles = sf_context->sf_num_subfiles;

    if (num_subfiles <= 0) {
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)",
                                num_subfiles);
    }
    else if (num_subfiles == 1) {
        /***************************************
         * No striping - just a single subfile *
         ***************************************/

        /* Make vector write call to subfile */
        if (H5FD_write_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed");

        /*
         * Mirror superblock writes to the stub file so that
         * legacy HDF5 applications can check what type of
         * file they are reading
         */
        if ((type == H5FD_MEM_SUPER) && (file_ptr->mpi_rank == 0)) {
            if (H5FD_write_vector(file_ptr->stub_file, 1, &type, &addr, &size, &buf) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
                                        "couldn't write superblock information to stub file");
        }
    }
    else {
        int64_t max_io_req_per_subfile;
        int64_t file_offset;
        int64_t block_size;
        size_t  max_depth;
        herr_t  status;
        int     num_subfiles_used = 0;
        int     first_subfile_idx = -1;

        /*************************************
         * Striping across multiple subfiles *
         *************************************/

        block_size = sf_context->sf_blocksize_per_stripe;
        max_depth  = (size / (size_t)block_size) + 2;

        /*
         * Given the number of subfiles, allocate vectors (one per subfile)
         * to contain the translation of the I/O request into a collection of
         * I/O requests.
         */
        if (NULL ==
            (source_data_offset = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate source data offset I/O vector");
        if (NULL == (sf_data_size = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate subfile data size I/O vector");
        if (NULL == (sf_offset = calloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset))))
            H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                    "can't allocate subfile offset I/O vector");

        H5_CHECKED_ASSIGN(file_offset, int64_t, addr, haddr_t);

        /*
         * Get the potential set of IOC transactions; e.g., data sizes,
         * offsets and datatypes.
         */
        status = init_indep_io(sf_context,         /* IN: Context used to look up config info */
                               file_offset,        /* IN: Starting file offset */
                               size,               /* IN: I/O size */
                               1,                  /* IN: Data extent of the 'type' assumes byte */
                               max_depth,          /* IN: Maximum stripe depth */
                               source_data_offset, /* OUT: Memory offset */
                               sf_offset,          /* OUT: File offset */
                               sf_data_size,       /* OUT: Length of this contiguous block */
                               &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */
                               &num_subfiles_used, /* OUT: Number of actual subfiles used */
                               &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */

        if (status < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions");

        if (max_io_req_per_subfile > 0) {
            uint32_t vector_len;

            H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int);

            /* Allocate I/O vectors */
            if (NULL == (io_types = malloc(vector_len * sizeof(*io_types))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O types vector");
            if (NULL == (io_addrs = malloc(vector_len * sizeof(*io_addrs))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O addresses vector");
            if (NULL == (io_sizes = malloc(vector_len * sizeof(*io_sizes))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O sizes vector");
            if (NULL == (io_bufs = malloc(vector_len * sizeof(*io_bufs))))
                H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
                                        "can't allocate subfile I/O buffers vector");

            for (int64_t i = 0; i < max_io_req_per_subfile; i++) {
                uint32_t final_vec_len    = vector_len;
                int      next_subfile_idx = first_subfile_idx;

                /* Fill in I/O types, offsets, sizes and buffers vectors */
                for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) {
                    size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i;

                    io_types[vec_idx] = type;
                    H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t);
                    H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t);
                    io_bufs[vec_idx] = ((const char *)buf + source_data_offset[idx]);

                    next_subfile_idx = (next_subfile_idx + 1) % num_subfiles;

                    /* Skip 0-sized I/Os */
                    if (io_sizes[vec_idx] == 0) {
                        final_vec_len--;
                        continue;
                    }

                    vec_idx++;
                }

                /* Make vector write call to subfile */
                if (H5FD_write_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes,
                                      io_bufs) < 0)
                    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed");

                /*
                 * Mirror superblock writes to the stub file so that
                 * legacy HDF5 applications can check what type of
                 * file they are reading
                 */
                if (file_ptr->mpi_rank == 0) {
                    for (size_t count_idx = 0; count_idx < (size_t)final_vec_len; count_idx++) {
                        if (io_types[count_idx] == H5FD_MEM_SUPER) {
                            if (H5FD_write(file_ptr->stub_file, H5FD_MEM_SUPER, io_addrs[count_idx],
                                           io_sizes[count_idx], io_bufs[count_idx]) < 0)
                                H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
                                                        "couldn't write superblock information to stub file");
                        }
                    }
                }
            }
        }
    }

    /* Point to the end of the current I/O */
    addr += (haddr_t)size;

    /* Update current file position and EOF */
    file_ptr->pos = addr;
    file_ptr->op  = OP_WRITE;

    /* Mimic the MPI I/O VFD */
    file_ptr->eof = HADDR_UNDEF;

    if (file_ptr->pos > file_ptr->local_eof)
        file_ptr->local_eof = file_ptr->pos;

done:
    free(io_bufs);
    free(io_sizes);
    free(io_addrs);
    free(io_types);
    free(sf_offset);
    free(sf_data_size);
    free(source_data_offset);

    if (ret_value < 0) {
        /* Reset last file I/O information */
        file_ptr->pos = HADDR_UNDEF;
        file_ptr->op  = OP_UNKNOWN;
    } /* end if */

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_write() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfile_read_vector  (internal function)
 *
 * Purpose:     Vector Read function for the sub-filing VFD.
 *
 *              Perform count reads from the specified file at the offsets
 *              provided in the addrs array, with the lengths and memory
 *              types provided in the sizes and types arrays.  Data read
 *              is returned in the buffers provided in the bufs array.
 *
 *              All reads are done according to the data transfer property
 *              list dxpl_id (which may be the constant H5P_DEFAULT).
 *
 * Return:      Success:    SUCCEED
 *                          All reads have completed successfully, and
 *                          the results havce been into the supplied
 *                          buffers.
 *
 *              Failure:    FAIL
 *                          The contents of supplied buffers are undefined.
 *
 * Notes:       Thus function doesn't actually implement vector read.
 *              Instead, it converts the vector read call into a series
 *              of scalar read calls.  Fix this when time permits.
 *
 *              Also, it didn't support the sizes and types optimization.
 *              I implemented a version of this which is more generous
 *              than that currently defined in the RFC.  This is good
 *              enough for now, but the final version should follow
 *              the RFC.
 *                                                    JRM -- 10/5/21
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_read_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
                            size_t sizes[], void *bufs[] /* out */)
{
    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
    H5FD_mpio_xfer_t  xfer_mode = H5FD_MPIO_INDEPENDENT;
    herr_t            ret_value = SUCCEED; /* Return value             */

    /* Check arguments
     * RAW - Do we really need to check arguments once again?
     * These have already been checked in H5FD__subfiling_read_vector (see below)!
     */
    if (!file_ptr)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL");

    if ((!types) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "types parameter can't be NULL if count is positive");

    if ((!addrs) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "addrs parameter can't be NULL if count is positive");

    if ((!sizes) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "sizes parameter can't be NULL if count is positive");

    if ((!bufs) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "bufs parameter can't be NULL if count is positive");

    /* Get the default dataset transfer property list if the user didn't provide one */
    if (H5P_DEFAULT == dxpl_id) {
        dxpl_id = H5P_DATASET_XFER_DEFAULT;
    }
    else {
        if (true != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
            H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list");
    }

    /* Set DXPL for operation */
    H5CX_set_dxpl(dxpl_id);

    /* TODO: setup real support for vector I/O */
    if (file_ptr->fa.require_ioc) {

        bool       extend_sizes = false;
        bool       extend_types = false;
        int        k;
        size_t     size;
        H5FD_mem_t type;
        haddr_t    eoa;

        assert((count == 0) || (sizes[0] != 0));
        assert((count == 0) || (types[0] != H5FD_MEM_NOLIST));

        if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL,
                                    "can't determine I/O collectivity setting");

        /* Currently, treat collective calls as independent */
        if (xfer_mode != H5FD_MPIO_INDEPENDENT)
            if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTSET, FAIL, "can't set I/O collectivity setting");

        /* Note that the following code does not let the sub-filing VFD participate
         * in collective calls when there is no data to write.  This is not an issue
         * now, as we don't do anything special with collective operations.  However
         * this needs to be fixed.
         */
        for (k = 0; k < (int)count; k++) {

            if (!extend_sizes) {

                if (sizes[k] == 0) {

                    extend_sizes = true;
                    size         = sizes[k - 1];
                }
                else {

                    size = sizes[k];
                }
            }

            if (!extend_types) {

                if (types[k] == H5FD_MEM_NOLIST) {

                    extend_types = true;
                    type         = types[k - 1];
                }
                else {

                    type = types[k];
                }
            }

            if (HADDR_UNDEF == (eoa = H5FD__subfiling_get_eoa(_file, type)))
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "driver get_eoa request failed");

            if ((addrs[k] + size) > eoa)
                H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
                                        "addr overflow, addrs[%d] = %llu, sizes[%d] = %llu, eoa = %llu",
                                        (int)k, (unsigned long long)(addrs[k]), (int)k,
                                        (unsigned long long)size, (unsigned long long)eoa);

            if (H5FD__subfiling_read(_file, type, dxpl_id, addrs[k], size, bufs[k]) != SUCCEED)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file vector read request failed");
        }
    }
    else {
        /* sec2 driver..
         * Call the subfiling 'direct write' version
         * of subfiling.
         */
        if (H5FD_read_vector(_file, count, types, addrs, sizes, bufs) != SUCCEED)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file vector read request failed");
    }

done:
    if (xfer_mode != H5FD_MPIO_INDEPENDENT)
        if (H5CX_set_io_xfer_mode(xfer_mode) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_CONTEXT, H5E_CANTSET, FAIL, "can't set I/O collectivity setting");

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_read_vector() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfile_write_vector  (internal function)
 *
 * Purpose:     Perform count writes to the specified file at the offsets
 *              provided in the addrs array. Lengths and memory
 *              types provided in the sizes and types arrays.  Data to be
 *              written is referenced by the bufs array.
 *
 *              All writes are done according to the data transfer property
 *              list dxpl_id (which may be the constant H5P_DEFAULT).
 *
 * Return:      Success:    SUCCEED
 *                          All writes have completed successfully.
 *
 *              Failure:    FAIL
 *                          An internal error was encountered, e.g the
 *                          input arguments are not valid, or the actual
 *                          subfiling writes have failed for some reason.
 *
 * Notes:       Thus function doesn't actually implement vector write.
 *              Instead, it converts the vector write call into a series
 *              of scalar read calls.  Fix this when time permits.
 *
 *              Also, it didn't support the sizes and types optimization.
 *              I implemented a version of this which is more generous
 *              than that currently defined in the RFC.  This is good
 *              enough for now, but the final version should follow
 *              the RFC.
 *                                                    JRM -- 10/5/21
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_write_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
                             haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */)
{
    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
    H5FD_mpio_xfer_t  xfer_mode = H5FD_MPIO_INDEPENDENT;
    herr_t            ret_value = SUCCEED; /* Return value             */

    assert(file_ptr != NULL); /* sanity check */

    /* Check arguments
     * RAW - Do we really need to check arguments once again?
     * These have already been checked in H5FD__subfiling_write_vector (see below)!
     */
    if (!file_ptr)
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL");

    if ((!types) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "types parameter can't be NULL if count is positive");

    if ((!addrs) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "addrs parameter can't be NULL if count is positive");

    if ((!sizes) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "sizes parameter can't be NULL if count is positive");

    if ((!bufs) && (count > 0))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
                                "bufs parameter can't be NULL if count is positive");

    /* Get the default dataset transfer property list if the user didn't provide one */
    if (H5P_DEFAULT == dxpl_id) {
        dxpl_id = H5P_DATASET_XFER_DEFAULT;
    }
    else {
        if (true != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
            H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list");
    }
    /* Call the subfiling IOC write*/
    if (file_ptr->fa.require_ioc) {

        bool       extend_sizes = false;
        bool       extend_types = false;
        int        k;
        size_t     size;
        H5FD_mem_t type;
        haddr_t    eoa;

        assert((count == 0) || (sizes[0] != 0));
        assert((count == 0) || (types[0] != H5FD_MEM_NOLIST));

        if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL,
                                    "can't determine I/O collectivity setting");

        /* Currently, treat collective calls as independent */
        if (xfer_mode != H5FD_MPIO_INDEPENDENT)
            if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_CONTEXT, H5E_CANTSET, FAIL, "can't set I/O collectivity setting");

        /* Note that the following code does not let the sub-filing VFD participate
         * in collective calls when there is no data to write.  This is not an issue
         * now, as we don't do anything special with collective operations.  However
         * this needs to be fixed.
         */
        for (k = 0; k < (int)count; k++) {

            if (!extend_sizes) {

                if (sizes[k] == 0) {

                    extend_sizes = true;
                    size         = sizes[k - 1];
                }
                else {

                    size = sizes[k];
                }
            }

            if (!extend_types) {

                if (types[k] == H5FD_MEM_NOLIST) {

                    extend_types = true;
                    type         = types[k - 1];
                }
                else {

                    type = types[k];
                }
            }

            if (HADDR_UNDEF == (eoa = H5FD__subfiling_get_eoa(_file, type)))
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "driver get_eoa request failed");

            if ((addrs[k] + size) > eoa)
                H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
                                        "addr overflow, addrs[%d] = %llu, sizes[%d] = %llu, eoa = %llu",
                                        (int)k, (unsigned long long)(addrs[k]), (int)k,
                                        (unsigned long long)size, (unsigned long long)eoa);

            if (H5FD__subfiling_write(_file, type, dxpl_id, addrs[k], size, bufs[k]) != SUCCEED)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed");
        }
    }
    else {
        /* sec2 driver..
         * Call the subfiling 'direct write' version
         * of subfiling.
         */
        if (H5FD_write_vector(_file, count, types, addrs, sizes, bufs) != SUCCEED)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed");
    }

done:
    if (xfer_mode != H5FD_MPIO_INDEPENDENT)
        if (H5CX_set_io_xfer_mode(xfer_mode) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_CONTEXT, H5E_CANTSET, FAIL, "can't set I/O collectivity setting");

    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FDsubfile__write_vector() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_truncate
 *
 * Purpose:     Makes sure that the true file size is the same as
 *              the end-of-allocation.
 *
 * Return:      SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, bool H5_ATTR_UNUSED closing)
{
    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file;
    herr_t            ret_value = SUCCEED; /* Return value */

    assert(file);

    /* Extend the file to make sure it's large enough */
    if (!H5_addr_eq(file->eoa, file->last_eoa)) {
        int64_t sf_eof;
        int64_t eoa;
        int     mpi_code;

        if (!H5CX_get_mpi_file_flushing()) {
            if (file->mpi_size > 1)
                if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
                    H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
        }

        if (0 == file->mpi_rank) {
            if (H5FD__subfiling__get_real_eof(file->context_id, &sf_eof) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get EOF");
        }

        if (file->mpi_size > 1) {
            if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file->comm)))
                H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
        }

        if (sf_eof < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid EOF");

        H5_CHECKED_ASSIGN(eoa, int64_t, file->eoa, haddr_t);

        /* truncate subfiles */
        /* This is a hack.  We should be doing the truncate of the subfiles via calls to
         * H5FD_truncate() with the IOC.  However, that system is messed up at present.
         * thus the following hack.
         *                                                 JRM -- 12/18/21
         */
        if (H5FD__subfiling__truncate_sub_files(file->context_id, eoa, file->comm) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "subfile truncate request failed");

#if 0 /* TODO: Should be truncated only to size of superblock metadata */
        /* Truncate the HDF5 stub file */
        if (file->mpi_rank == 0) {
            if (H5FD_truncate(file->stub_file, closing) < 0)
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "stub file truncate request failed");
        }
#endif

        /* Reset last file I/O information */
        file->pos = HADDR_UNDEF;
        file->op  = OP_UNKNOWN;

        /* Update the 'last' eoa value */
        file->last_eoa = file->eoa;
    }

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_truncate() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_lock
 *
 * Purpose:     To place an advisory lock on a file.
 *      The lock type to apply depends on the parameter "rw":
 *          true--opens for write: an exclusive lock
 *          false--opens for read: a shared lock
 *
 * Return:      SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
#if 0
static herr_t
H5FD__subfiling_lock(H5FD_t *_file, bool rw)
{
    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file; /* VFD file struct  */
    herr_t            ret_value = SUCCEED;                   /* Return value       */

    assert(file);

    if (file->fa.require_ioc) {
#ifdef VERBOSE
        puts("Subfiling driver doesn't support file locking");
#endif
    }
    else {
        if (H5FD_lock(file->sf_file, rw) < 0)
            H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to lock file");
    } /* end if */

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_lock() */

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_unlock
 *
 * Purpose:     To remove the existing lock on the file
 *
 * Return:      SUCCEED/FAIL
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_unlock(H5FD_t *_file)
{
    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file; /* VFD file struct */
    herr_t            ret_value = SUCCEED;                   /* Return value             */

    assert(file);

    if (H5FD_unlock(file->sf_file) < 0)
        H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to lock file");

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_unlock() */
#endif

static herr_t
H5FD__subfiling_del(const char *name, hid_t fapl)
{
    const H5FD_subfiling_config_t *subfiling_config = NULL;
    H5FD_subfiling_config_t        default_config;
    H5P_genplist_t                *plist     = NULL;
    herr_t                         ret_value = SUCCEED;

    if (NULL == (plist = H5P_object_verify(fapl, H5P_FILE_ACCESS)))
        H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");

    if (H5FD_SUBFILING != H5P_peek_driver(plist))
        H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect driver set on FAPL");

    if (NULL == (subfiling_config = H5P_peek_driver_info(plist))) {
        if (H5FD__subfiling_get_default_config(fapl, &default_config) < 0)
            H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
                                    "can't get default Subfiling VFD configuration");
        subfiling_config = &default_config;
    }

    if (H5FD_delete(name, subfiling_config->ioc_fapl_id) < 0)
        H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTDELETE, FAIL, "unable to delete file");

done:
    if (subfiling_config == &default_config)
        if (H5I_dec_ref(subfiling_config->ioc_fapl_id) < 0)
            H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, FAIL, "unable to close IOC FAPL");

    H5_SUBFILING_FUNC_LEAVE_API;
}

/*-------------------------------------------------------------------------
 * Function:    H5FD__subfiling_ctl
 *
 * Purpose:     Subfiling version of the ctl callback.
 *
 *              The desired operation is specified by the op_code
 *              parameter.
 *
 *              The flags parameter controls management of op_codes that
 *              are unknown to the callback
 *
 *              The input and output parameters allow op_code specific
 *              input and output
 *
 *              At present, the supported op codes are:
 *
 *                  H5FD_CTL_GET_MPI_COMMUNICATOR_OPCODE
 *                  H5FD_CTL_GET_MPI_RANK_OPCODE
 *                  H5FD_CTL_GET_MPI_SIZE_OPCODE
 *
 *              Note that these opcodes must be supported by all VFDs that
 *              support MPI.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD__subfiling_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_ATTR_UNUSED *input,
                    void **output)
{
    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file;
    herr_t            ret_value = SUCCEED; /* Return value */

    /* Sanity checks */
    assert(file);
    assert(H5FD_SUBFILING == file->pub.driver_id);

    switch (op_code) {

        case H5FD_CTL_GET_MPI_COMMUNICATOR_OPCODE:
            assert(output);
            assert(*output);

            /*
             * Return a separate MPI communicator to the caller so
             * that our own MPI calls won't have a chance to conflict
             */
            if (file->ext_comm == MPI_COMM_NULL) {
                if (H5_mpi_comm_dup(file->comm, &file->ext_comm) < 0)
                    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't duplicate MPI communicator");
            }

            **((MPI_Comm **)output) = file->ext_comm;
            break;

        case H5FD_CTL_GET_MPI_RANK_OPCODE:
            assert(output);
            assert(*output);
            **((int **)output) = file->mpi_rank;
            break;

        case H5FD_CTL_GET_MPI_SIZE_OPCODE:
            assert(output);
            assert(*output);
            **((int **)output) = file->mpi_size;
            break;

        default: /* unknown op code */
            if (flags & H5FD_CTL_FAIL_IF_UNKNOWN_FLAG) {
                H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_FCNTL, FAIL, "unknown op_code and fail if unknown");
            }
            break;
    }

done:
    H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_ctl() */

/*-------------------------------------------------------------------------
 * Function:    init_indep_io
 *
 * Purpose:     Utility function to initialize the set of I/O transactions
 *              used to communicate with I/O concentrators for read and
 *              write I/O operations.
 *
 *              Fills the I/O vectors contained in the output arrays
 *              `mem_buf_offset`, `target_file_offset` and `io_block_len`.
 *              As a consequence of not allowing use of MPI derived
 *              datatypes in the VFD layer, we need to accommodate the
 *              possibility that large I/O transactions will be required to
 *              use multiple I/Os per subfile.
 *
 *              Example: Using 4 subfiles, each with 1M stripe-depth; when
 *              presented an I/O request for 8MB then at a minimum each
 *              subfile will require 2 I/Os of 1MB each.  Depending on the
 *              starting file offset, the 2 I/Os can instead be 3...
 *
 *              To fully describe the I/O transactions for reads and writes
 *              the output arrays are therefore arrays of I/O vectors,
 *              where each vector has a length of which corresponds to the
 *              max number of I/O transactions per subfile. In the example
 *              above, these vector lengths can be 2 or 3. The actual
 *              length is determined by the 'container_depth' variable.
 *
 *              For I/O operations which involve a subset of subfiles, the
 *              vector entries for the unused subfiles will have lengths of
 *              zero and be empty. The 'container_depth' in this case will
 *              always be 1.
 *
 *              sf_context (IN)
 *                - the subfiling context for the file
 *
 *              file_offset (IN)
 *                - the starting file offset for I/O
 *
 *              io_nelemts (IN)
 *                - the number of data elements for the I/O operation
 *
 *              dtype_extent (IN)
 *                - the extent of the datatype of each data element for
 *                  the I/O operation
 *
 *              max_iovec_len (IN)
 *                - the maximum size for a single I/O vector in each of
 *                  the output arrays `mem_buf_offset`, `io_block_len`
 *                  and `sf_offset`. NOTE that this routine expects each
 *                  of these output arrays to have enough space allocated
 *                  for one I/O vector PER subfile. Therefore, the total
 *                  size of each output array should be at least
 *                  `max_iovec_len * num_subfiles`.
 *
 *              mem_buf_offset (OUT)
 *                - output array of vectors (one vector for each subfile)
 *                  containing the set of offsets into the memory buffer
 *                  for I/O
 *
 *              target_file_offset (OUT)
 *                - output array of vectors (one vector for each subfile)
 *                  containing the set of offsets into the target file
 *
 *              io_block_len (OUT)
 *                - output array of vectors (one vector for each subfile)
 *                  containing the set of block lengths for each source
 *                  buffer/target file offset.
 *
 *              first_subfile_index (OUT)
 *                - the index of the first subfile that this I/O operation
 *                  begins at
 *
 *              n_subfiles_used (OUT)
 *                - the number of subfiles actually used for this I/O
 *                  operation, which may be different from the total
 *                  number of subfiles for the file
 *
 *              max_io_req_per_subfile (OUT)
 *                - the maximum number of I/O requests to any particular
 *                  subfile, or the maximum "depth" of each I/O vector
 *                  in the output arrays.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts, size_t dtype_extent,
              size_t max_iovec_len, int64_t *mem_buf_offset, int64_t *target_file_offset,
              int64_t *io_block_len, int *first_subfile_index, int *n_subfiles_used,
              int64_t *max_io_req_per_subfile)
{
    int64_t stripe_size          = 0;
    int64_t block_size           = 0;
    int64_t data_size            = 0;
    int64_t stripe_idx           = 0;
    int64_t final_stripe_idx     = 0;
    int64_t curr_stripe_idx      = 0;
    int64_t offset_in_stripe     = 0;
    int64_t offset_in_block      = 0;
    int64_t final_offset         = 0;
    int64_t start_length         = 0;
    int64_t final_length         = 0;
    int64_t first_subfile        = 0;
    int64_t last_subfile         = 0;
    int64_t start_row            = 0;
    int64_t row_offset           = 0;
    int64_t row_stripe_idx_start = 0;
    int64_t row_stripe_idx_final = 0;
    int64_t max_iovec_depth      = 0;
    int64_t curr_max_iovec_depth = 0;
    int64_t total_bytes          = 0;
    int64_t mem_offset           = 0;
    int     num_subfiles         = 0;
    herr_t  ret_value            = SUCCEED;

    assert(sf_context);
    assert(sf_context->sf_stripe_size > 0);
    assert(sf_context->sf_blocksize_per_stripe > 0);
    assert(sf_context->sf_num_subfiles > 0);
    assert(sf_context->topology);
    assert(mem_buf_offset);
    assert(target_file_offset);
    assert(io_block_len);
    assert(first_subfile_index);
    assert(n_subfiles_used);
    assert(max_io_req_per_subfile);

    *first_subfile_index    = 0;
    *n_subfiles_used        = 0;
    *max_io_req_per_subfile = 0;

    /*
     * Retrieve the needed fields from the subfiling context.
     *
     *  stripe_size
     *    - the size of the data striping across the file's subfiles
     *  block_size
     *    - the size of a "block" across the IOCs, as calculated
     *      by the stripe size multiplied by the number of
     *      subfiles
     *  num_subfiles
     *    - the total number of subfiles for the logical
     *      HDF5 file
     *  num_io_concentrators
     *    - the number of I/O concentrators currently being
     *      used
     */
    stripe_size  = sf_context->sf_stripe_size;
    block_size   = sf_context->sf_blocksize_per_stripe;
    num_subfiles = sf_context->sf_num_subfiles;

    H5_CHECKED_ASSIGN(data_size, int64_t, (io_nelemts * dtype_extent), size_t);

    /*
     * Calculate the following from the starting file offset:
     *
     *  stripe_idx
     *    - a stripe "index" given by the file offset divided by the
     *      stripe size. Note that when the file offset equals or exceeds
     *      the block size, we simply wrap around. So, for example, if 4
     *      subfiles are being used with a stripe size of 1MiB, the block
     *      size would be 4MiB and file offset 4096 would have a stripe
     *      index of 4 and reside in the same subfile as stripe index 0
     *      (offsets 0-1023)
     *  offset_in_stripe
     *    - the relative offset in the stripe that the starting file
     *      offset resides in
     *  offset_in_block
     *    - the relative offset in the "block" of stripes across the
     *      subfiles
     *  final_offset
     *    - the last offset in the virtual file covered by this I/O
     *      operation. Simply the I/O size added to the starting file
     *      offset.
     */
    stripe_idx       = file_offset / stripe_size;
    offset_in_stripe = file_offset % stripe_size;
    offset_in_block  = file_offset % block_size;
    final_offset     = file_offset + data_size;

    /* Determine the size of data written to the first and last stripes */
    start_length = MIN(data_size, (stripe_size - offset_in_stripe));
    final_length = (start_length == data_size ? 0 : final_offset % stripe_size);
    assert(start_length <= stripe_size);
    assert(final_length <= stripe_size);

    /*
     * Determine which subfile the I/O request begins in and which
     * "row" the I/O request begins in within the "block" of stripes
     * across the subfiles. Note that "row" here is just a conceptual
     * way to think of how a block of data stripes is laid out across
     * the subfiles. A block's "column" size in bytes is equal to the
     * stripe size multiplied by the number of subfiles. Therefore,
     * file offsets that are multiples of the block size begin a new
     * "row".
     */
    start_row     = stripe_idx / num_subfiles;
    first_subfile = stripe_idx % num_subfiles;
    H5_CHECK_OVERFLOW(first_subfile, int64_t, int);

    /*
     * Set initial file offset for starting "row"
     * based on the start row index
     */
    row_offset = start_row * block_size;

    /*
     * Determine the stripe "index" of the last offset in the
     * virtual file and, from that, determine the subfile that
     * the I/O request ends in.
     */
    final_stripe_idx = final_offset / stripe_size;
    last_subfile     = final_stripe_idx % num_subfiles;

    /*
     * Determine how "deep" the resulting I/O vectors are at
     * most by calculating the maximum number of "rows" spanned
     * for any particular subfile; e.g. the maximum number of
     * I/O requests for any particular subfile
     */
    row_stripe_idx_start = stripe_idx - first_subfile;
    row_stripe_idx_final = final_stripe_idx - last_subfile;
    max_iovec_depth      = ((row_stripe_idx_final - row_stripe_idx_start) / num_subfiles) + 1;

    if (last_subfile < first_subfile)
        max_iovec_depth--;

    /* Set returned parameters early */
    *first_subfile_index    = (int)first_subfile;
    *n_subfiles_used        = num_subfiles;
    *max_io_req_per_subfile = max_iovec_depth;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: FILE OFFSET = %" PRId64 ", DATA SIZE = %zu, STRIPE SIZE = %" PRId64, __func__,
                     file_offset, io_nelemts, stripe_size);
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: FIRST SUBFILE = %" PRId64 ", LAST SUBFILE = %" PRId64 ", "
                     "MAX IOVEC DEPTH = %" PRId64 ", START LENGTH = %" PRId64 ", FINAL LENGTH = %" PRId64,
                     __func__, first_subfile, last_subfile, max_iovec_depth, start_length, final_length);
#endif

    /*
     * Loop through the set of subfiles to determine the various
     * vector components for each. Subfiles whose data size is
     * zero will not have I/O requests passed to them.
     */
    curr_stripe_idx      = stripe_idx;
    curr_max_iovec_depth = max_iovec_depth;
    for (int i = 0, k = (int)first_subfile; i < num_subfiles; i++) {
        int64_t *_mem_buf_offset;
        int64_t *_target_file_offset;
        int64_t *_io_block_len;
        int64_t  subfile_bytes = 0;
        int64_t  iovec_depth;
        bool     is_first = false;
        bool     is_last  = false;
        size_t   output_offset;

        iovec_depth = curr_max_iovec_depth;

        /*
         * Setup the pointers to the next set of I/O vectors in
         * the output arrays and clear those vectors
         */
        output_offset       = (size_t)(k)*max_iovec_len;
        _mem_buf_offset     = mem_buf_offset + output_offset;
        _target_file_offset = target_file_offset + output_offset;
        _io_block_len       = io_block_len + output_offset;

        memset(_mem_buf_offset, 0, (max_iovec_len * sizeof(*_mem_buf_offset)));
        memset(_target_file_offset, 0, (max_iovec_len * sizeof(*_target_file_offset)));
        memset(_io_block_len, 0, (max_iovec_len * sizeof(*_io_block_len)));

        if (total_bytes == data_size) {
            *n_subfiles_used = i;
            goto done;
        }

        if (total_bytes < data_size) {
            int64_t num_full_stripes = iovec_depth;

            if (k == first_subfile) {
                is_first = true;

                /*
                 * Add partial segment length if not
                 * starting on a stripe boundary
                 */
                if (start_length < stripe_size) {
                    subfile_bytes += start_length;
                    num_full_stripes--;
                }
            }

            if (k == last_subfile) {
                is_last = true;

                /*
                 * Add partial segment length if not
                 * ending on a stripe boundary
                 */
                if (final_length < stripe_size) {
                    subfile_bytes += final_length;
                    if (num_full_stripes)
                        num_full_stripes--;
                }
            }

            /* Account for subfiles with uniform segments */
            if (!is_first && !is_last) {
                bool thin_uniform_section = false;

                if (last_subfile >= first_subfile) {
                    /*
                     * When a subfile has an index value that is greater
                     * than both the starting subfile and ending subfile
                     * indices, it is a "thinner" section with a smaller
                     * I/O vector depth.
                     */
                    thin_uniform_section = (k > first_subfile) && (k > last_subfile);
                }

                if (last_subfile < first_subfile) {
                    /*
                     * This can also happen when the subfile with the final
                     * data segment has a smaller subfile index than the
                     * subfile with the first data segment and the current
                     * subfile index falls between the two.
                     */
                    thin_uniform_section =
                        thin_uniform_section || ((last_subfile < k) && (k < first_subfile));
                }

                if (thin_uniform_section) {
                    assert(iovec_depth > 1);
                    assert(num_full_stripes > 1);

                    iovec_depth--;
                    num_full_stripes--;
                }
            }

            /*
             * After accounting for the length of the initial
             * and/or final data segments, add the combined
             * size of the fully selected I/O stripes to the
             * running bytes total
             */
            subfile_bytes += num_full_stripes * stripe_size;
            total_bytes += subfile_bytes;
        }

        _mem_buf_offset[0]     = mem_offset;
        _target_file_offset[0] = row_offset + offset_in_block;
        _io_block_len[0]       = subfile_bytes;

        if (num_subfiles > 1) {
            int64_t curr_file_offset = row_offset + offset_in_block;

            /* Fill the I/O vectors */
            if (is_first) {
                if (is_last) { /* First + Last */
                    if (iovec_fill_first_last(sf_context, iovec_depth, subfile_bytes, mem_offset,
                                              curr_file_offset, start_length, final_length, _mem_buf_offset,
                                              _target_file_offset, _io_block_len) < 0)
                        H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
                }
                else { /* First ONLY */
                    if (iovec_fill_first(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
                                         start_length, _mem_buf_offset, _target_file_offset,
                                         _io_block_len) < 0)
                        H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
                }
                /* Move the memory pointer to the starting location
                 * for next subfile I/O request.
                 */
                mem_offset += start_length;
            }
            else if (is_last) { /* Last ONLY */
                if (iovec_fill_last(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
                                    final_length, _mem_buf_offset, _target_file_offset, _io_block_len) < 0)
                    H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");

                mem_offset += stripe_size;
            }
            else { /* Everything else (uniform) */
                if (iovec_fill_uniform(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
                                       _mem_buf_offset, _target_file_offset, _io_block_len) < 0)
                    H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");

                mem_offset += stripe_size;
            }
        }

        offset_in_block += _io_block_len[0];

        k++;
        curr_stripe_idx++;

        if (k == num_subfiles) {
            k                    = 0;
            offset_in_block      = 0;
            curr_max_iovec_depth = ((final_stripe_idx - curr_stripe_idx) / num_subfiles) + 1;

            row_offset += block_size;
        }

        assert(offset_in_block <= block_size);
    }

    if (total_bytes != data_size)
        H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL,
                                "total bytes (%" PRId64 ") didn't match data size (%" PRId64 ")!",
                                total_bytes, data_size);

done:
    return ret_value;
}

/*-------------------------------------------------------------------------
 * Function:    iovec_fill_first
 *
 * Purpose:     Fills I/O vectors for the case where the IOC has the first
 *              data segment of the I/O operation.
 *
 *              If the 'first_io_len' is sufficient to complete the I/O to
 *              the IOC, then the first entry in the I/O vectors is simply
 *              filled out with the given starting memory/file offsets and
 *              the first I/O size. Otherwise, the remaining entries in the
 *              I/O vectors are filled out as data segments with size equal
 *              to the stripe size. Each data segment is separated from a
 *              previous or following segment by 'sf_blocksize_per_stripe'
 *              bytes of data.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
iovec_fill_first(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                 int64_t start_mem_offset, int64_t start_file_offset, int64_t first_io_len,
                 int64_t *mem_offset_out, int64_t *target_file_offset_out, int64_t *io_block_len_out)
{
    int64_t stripe_size;
    int64_t block_size;
    int64_t total_bytes = 0;
    herr_t  ret_value   = SUCCEED;

    assert(sf_context);
    assert(mem_offset_out);
    assert(target_file_offset_out);
    assert(io_block_len_out);
    assert(iovec_depth > 0);

    stripe_size = sf_context->sf_stripe_size;
    block_size  = sf_context->sf_blocksize_per_stripe;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: start_mem_offset = %" PRId64 ", start_file_offset = %" PRId64
                     ", first_io_len = %" PRId64,
                     __func__, start_mem_offset, start_file_offset, first_io_len);
#endif

    mem_offset_out[0]         = start_mem_offset;
    target_file_offset_out[0] = start_file_offset;
    io_block_len_out[0]       = first_io_len;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: mem_offset[0] = %" PRId64 ", file_offset[0] = %" PRId64
                     ", io_block_len[0] = %" PRId64,
                     __func__, mem_offset_out[0], target_file_offset_out[0], io_block_len_out[0]);
#endif

    if (first_io_len == target_datasize)
        H5_SUBFILING_GOTO_DONE(SUCCEED);

    if (first_io_len > 0) {
        int64_t offset_in_stripe = start_file_offset % stripe_size;
        int64_t next_mem_offset  = block_size - offset_in_stripe;
        int64_t next_file_offset = start_file_offset + (block_size - offset_in_stripe);

        total_bytes = first_io_len;

        for (int64_t i = 1; i < iovec_depth; i++) {
            mem_offset_out[i]         = next_mem_offset;
            target_file_offset_out[i] = next_file_offset;
            io_block_len_out[i]       = stripe_size;

#ifdef H5_SUBFILING_DEBUG
            H5_subfiling_log(sf_context->sf_context_id,
                             "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                             ", io_block_len[%" PRId64 "] = %" PRId64,
                             __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                             io_block_len_out[i]);
#endif

            next_mem_offset += block_size;
            next_file_offset += block_size;
            total_bytes += stripe_size;
        }

        if (total_bytes != target_datasize)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL,
                                    "total bytes (%" PRId64 ") didn't match target data size (%" PRId64 ")!",
                                    total_bytes, target_datasize);
    }

done:
    return ret_value;
}

/*-------------------------------------------------------------------------
 * Function:    iovec_fill_last
 *
 * Purpose:     Fills I/O vectors for the case where the IOC has the last
 *              data segment of the I/O operation.
 *
 *              If the 'last_io_len' is sufficient to complete the I/O to
 *              the IOC, then the first entry in the I/O vectors is simply
 *              filled out with the given starting memory/file offsets and
 *              the last I/O size. Otherwise, all entries in the I/O
 *              vectors except the last entry are filled out as data
 *              segments with size equal to the stripe size. Each data
 *              segment is separated from a previous or following segment
 *              by 'sf_blocksize_per_stripe' bytes of data. Then, the last
 *              entry in the I/O vectors is filled out with the final
 *              memory/file offsets and the last I/O size.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
iovec_fill_last(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                int64_t start_mem_offset, int64_t start_file_offset, int64_t last_io_len,
                int64_t *mem_offset_out, int64_t *target_file_offset_out, int64_t *io_block_len_out)
{
    int64_t stripe_size;
    int64_t block_size;
    int64_t total_bytes = 0;
    herr_t  ret_value   = SUCCEED;

    assert(sf_context);
    assert(mem_offset_out);
    assert(target_file_offset_out);
    assert(io_block_len_out);
    assert(iovec_depth > 0);

    stripe_size = sf_context->sf_stripe_size;
    block_size  = sf_context->sf_blocksize_per_stripe;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: start_mem_offset = %" PRId64 ", start_file_offset = %" PRId64
                     ", last_io_len = %" PRId64,
                     __func__, start_mem_offset, start_file_offset, last_io_len);
#endif

    mem_offset_out[0]         = start_mem_offset;
    target_file_offset_out[0] = start_file_offset;
    io_block_len_out[0]       = last_io_len;

    if (last_io_len == target_datasize) {
#ifdef H5_SUBFILING_DEBUG
        H5_subfiling_log(sf_context->sf_context_id,
                         "%s: mem_offset[0] = %" PRId64 ", file_offset[0] = %" PRId64
                         ", io_block_len[0] = %" PRId64,
                         __func__, mem_offset_out[0], target_file_offset_out[0], io_block_len_out[0]);
#endif

        H5_SUBFILING_GOTO_DONE(SUCCEED);
    }
    else {
        int64_t next_mem_offset  = start_mem_offset + block_size;
        int64_t next_file_offset = start_file_offset + block_size;
        int64_t i;

        /*
         * If the last I/O size doesn't cover the target data
         * size, there is at least one full stripe preceding
         * the last I/O block
         */
        io_block_len_out[0] = stripe_size;

#ifdef H5_SUBFILING_DEBUG
        H5_subfiling_log(sf_context->sf_context_id,
                         "%s: mem_offset[0] = %" PRId64 ", file_offset[0] = %" PRId64
                         ", io_block_len[0] = %" PRId64,
                         __func__, mem_offset_out[0], target_file_offset_out[0], io_block_len_out[0]);
#endif

        total_bytes = stripe_size;

        for (i = 1; i < iovec_depth - 1;) {
            mem_offset_out[i]         = next_mem_offset;
            target_file_offset_out[i] = next_file_offset;
            io_block_len_out[i]       = stripe_size;

#ifdef H5_SUBFILING_DEBUG
            H5_subfiling_log(sf_context->sf_context_id,
                             "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                             ", io_block_len[%" PRId64 "] = %" PRId64,
                             __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                             io_block_len_out[i]);
#endif

            next_mem_offset += block_size;
            next_file_offset += block_size;
            total_bytes += stripe_size;

            i++;
        }

        mem_offset_out[i]         = next_mem_offset;
        target_file_offset_out[i] = next_file_offset;
        io_block_len_out[i]       = last_io_len;

#ifdef H5_SUBFILING_DEBUG
        H5_subfiling_log(sf_context->sf_context_id,
                         "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                         ", io_block_len[%" PRId64 "] = %" PRId64,
                         __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                         io_block_len_out[i]);
#endif

        total_bytes += last_io_len;

        if (total_bytes != target_datasize)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL,
                                    "total bytes (%" PRId64 ") didn't match target data size (%" PRId64 ")!",
                                    total_bytes, target_datasize);
    }

done:
    return ret_value;
}

/*-------------------------------------------------------------------------
 * Function:    iovec_fill_first_last
 *
 * Purpose:     Fills I/O vectors for the case where the IOC has the first
 *              and last data segments of the I/O operation. This function
 *              is essentially a merge of the iovec_fill_first and
 *              iovec_fill_last functions.
 *
 *              If the 'first_io_len' is sufficient to complete the I/O to
 *              the IOC, then the first entry in the I/O vectors is simply
 *              filled out with the given starting memory/file offsets and
 *              the first I/O size. Otherwise, the remaining entries in the
 *              I/O vectors except the last are filled out as data segments
 *              with size equal to the stripe size. Each data segment is
 *              separated from a previous or following segment by
 *              'sf_blocksize_per_stripe' bytes of data. Then, the last
 *              entry in the I/O vectors is filled out with the final
 *              memory/file offsets and the last I/O size.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
iovec_fill_first_last(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                      int64_t start_mem_offset, int64_t start_file_offset, int64_t first_io_len,
                      int64_t last_io_len, int64_t *mem_offset_out, int64_t *target_file_offset_out,
                      int64_t *io_block_len_out)
{
    int64_t stripe_size;
    int64_t block_size;
    int64_t total_bytes = 0;
    herr_t  ret_value   = SUCCEED;

    assert(sf_context);
    assert(mem_offset_out);
    assert(target_file_offset_out);
    assert(io_block_len_out);
    assert(iovec_depth > 0);

    stripe_size = sf_context->sf_stripe_size;
    block_size  = sf_context->sf_blocksize_per_stripe;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: start_mem_offset = %" PRId64 ", start_file_offset = %" PRId64
                     ", first_io_len = %" PRId64 ", last_io_len = %" PRId64,
                     __func__, start_mem_offset, start_file_offset, first_io_len, last_io_len);
#endif

    mem_offset_out[0]         = start_mem_offset;
    target_file_offset_out[0] = start_file_offset;
    io_block_len_out[0]       = first_io_len;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: mem_offset[0] = %" PRId64 ", file_offset[0] = %" PRId64
                     ", io_block_len[0] = %" PRId64,
                     __func__, mem_offset_out[0], target_file_offset_out[0], io_block_len_out[0]);
#endif

    if (first_io_len == target_datasize)
        H5_SUBFILING_GOTO_DONE(SUCCEED);

    if (first_io_len > 0) {
        int64_t offset_in_stripe = start_file_offset % stripe_size;
        int64_t next_mem_offset  = block_size - offset_in_stripe;
        int64_t next_file_offset = start_file_offset + (block_size - offset_in_stripe);
        int64_t i;

        total_bytes = first_io_len;

        for (i = 1; i < iovec_depth - 1;) {
            mem_offset_out[i]         = next_mem_offset;
            target_file_offset_out[i] = next_file_offset;
            io_block_len_out[i]       = stripe_size;

#ifdef H5_SUBFILING_DEBUG
            H5_subfiling_log(sf_context->sf_context_id,
                             "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                             ", io_block_len[%" PRId64 "] = %" PRId64,
                             __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                             io_block_len_out[i]);
#endif

            next_mem_offset += block_size;
            next_file_offset += block_size;
            total_bytes += stripe_size;

            i++;
        }

        mem_offset_out[i]         = next_mem_offset;
        target_file_offset_out[i] = next_file_offset;
        io_block_len_out[i]       = last_io_len;

#ifdef H5_SUBFILING_DEBUG
        H5_subfiling_log(sf_context->sf_context_id,
                         "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                         ", io_block_len[%" PRId64 "] = %" PRId64,
                         __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                         io_block_len_out[i]);
#endif

        total_bytes += last_io_len;

        if (total_bytes != target_datasize)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL,
                                    "total bytes (%" PRId64 ") didn't match target data size (%" PRId64 ")!",
                                    total_bytes, target_datasize);
    }

done:
    return ret_value;
}

/*-------------------------------------------------------------------------
 * Function:    iovec_fill_uniform
 *
 * Purpose:     Fills I/O vectors for the typical I/O operation when
 *              reading data from or writing data to an I/O Concentrator
 *              (IOC).
 *
 *              Each data segment is of 'stripe_size' length and will be
 *              separated from a previous or following segment by
 *              'sf_blocksize_per_stripe' bytes of data.
 *
 * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
static herr_t
iovec_fill_uniform(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
                   int64_t start_mem_offset, int64_t start_file_offset, int64_t *mem_offset_out,
                   int64_t *target_file_offset_out, int64_t *io_block_len_out)
{
    int64_t stripe_size;
    int64_t block_size;
    int64_t total_bytes = 0;
    herr_t  ret_value   = SUCCEED;

    assert(sf_context);
    assert(mem_offset_out);
    assert(target_file_offset_out);
    assert(io_block_len_out);
    assert((iovec_depth > 0) || (target_datasize == 0));

    stripe_size = sf_context->sf_stripe_size;
    block_size  = sf_context->sf_blocksize_per_stripe;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: start_mem_offset = %" PRId64 ", start_file_offset = %" PRId64
                     ", segment size = %" PRId64,
                     __func__, start_mem_offset, start_file_offset, stripe_size);
#endif

    mem_offset_out[0]         = start_mem_offset;
    target_file_offset_out[0] = start_file_offset;
    io_block_len_out[0]       = stripe_size;

#ifdef H5_SUBFILING_DEBUG
    H5_subfiling_log(sf_context->sf_context_id,
                     "%s: mem_offset[0] = %" PRId64 ", file_offset[0] = %" PRId64
                     ", io_block_len[0] = %" PRId64,
                     __func__, mem_offset_out[0], target_file_offset_out[0], io_block_len_out[0]);
#endif

    if (target_datasize == 0) {
#ifdef H5_SUBFILING_DEBUG
        H5_subfiling_log(sf_context->sf_context_id, "%s: target_datasize = 0", __func__);
#endif

        io_block_len_out[0] = 0;
        H5_SUBFILING_GOTO_DONE(SUCCEED);
    }

    if (target_datasize > stripe_size) {
        int64_t next_mem_offset  = start_mem_offset + block_size;
        int64_t next_file_offset = start_file_offset + block_size;

        total_bytes = stripe_size;

        for (int64_t i = 1; i < iovec_depth; i++) {
            mem_offset_out[i]         = next_mem_offset;
            target_file_offset_out[i] = next_file_offset;
            io_block_len_out[i]       = stripe_size;

#ifdef H5_SUBFILING_DEBUG
            H5_subfiling_log(sf_context->sf_context_id,
                             "%s: mem_offset[%" PRId64 "] = %" PRId64 ", file_offset[%" PRId64 "] = %" PRId64
                             ", io_block_len[%" PRId64 "] = %" PRId64,
                             __func__, i, mem_offset_out[i], i, target_file_offset_out[i], i,
                             io_block_len_out[i]);
#endif

            next_mem_offset += block_size;
            next_file_offset += block_size;
            total_bytes += stripe_size;
        }

        if (total_bytes != target_datasize)
            H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL,
                                    "total bytes (%" PRId64 ") didn't match target data size (%" PRId64 ")!",
                                    total_bytes, target_datasize);
    }

done:
    return ret_value;
}
