/*****************************************************************************\
 *  gres_ve.c - Support VEs as a generic resources.
 *****************************************************************************
 *  Copyright (C) 2021 NEC Corporation.
 *  Based upon gres_gpu.c with the copyright notice shown below:
 *  Copyright (C) 2010 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include <ctype.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/slurm_xlator.h"
#include "src/common/bitstring.h"
#include "src/common/env.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/xstring.h"

#include "../common/gres_common.h"

/*
 * These variables are required by the generic plugin interface.  If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - A string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - A string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *	<application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "auth" for Slurm authentication) and <method> is a
 * description of how this plugin satisfies that application.  Slurm will
 * only load authentication plugins if the plugin_type string has a prefix
 * of "auth/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char	plugin_name[]		= "Gres VE plugin";
const char	plugin_type[]		= "gres/ve";
const uint32_t	plugin_version		= SLURM_VERSION_NUMBER;

/* GRES name handed to common_node_config_load() in node_config_load(). */
static char	gres_name[]		= "ve";
/* Prefix the slurmd spool directory must start with in _write_fsinfo(). */
static char     spooldir[]              = "/var/spool/";
/* Spool-directory file that receives the output of the "mount" command. */
static char     checked_fs_name[]       = "checkedFS";
/* Spool-directory file read by _set_env_scatefs() for the HCA list. */
static char     all_hcalist_name[]      = "all_hcalist";

/* Device list; set by node_config_load() / recv_stepd(), freed in fini(). */
static List gres_devices = NULL;

/*
 * Write file system information for the Aurora system to the slurmd spool
 * directory.
 *
 * Runs "mount" through the shell and redirects its output to
 * <slurmd_spooldir>/checkedFS.  Nothing is written unless the configured
 * spool directory is a non-empty path starting with "/var/spool/".
 * Every failure is logged and otherwise ignored.
 */
static void _write_fsinfo(void)
{
	const char *check_cmd = "mount";
	char *slurmd_spooldir = NULL, *file_path = NULL, *command = NULL;
	size_t spooldir_len = strlen(spooldir);
	struct stat stat_buf;
	int status;

	slurmd_spooldir = xstrdup(slurm_conf.slurmd_spooldir);
	if (!slurmd_spooldir) {
		error("%s: fail to duplicate slurmd_spooldir", __func__);
		return;
	}

	/*
	 * Do not write to any directory other than slurmd spool directory
	 * (the default value is "/var/spool/slurmd").
	 * The path is placed inside a single-quoted shell word below, so a
	 * path that itself contains a single quote is refused as well.
	 */
	if ((xstrncmp(slurmd_spooldir, spooldir, spooldir_len) != 0) ||
	    (strlen(slurmd_spooldir) <= spooldir_len) ||
	    (strchr(slurmd_spooldir, '\'') != NULL)) {
		error("%s: inappropriate directory: %s",
			__func__, slurmd_spooldir);
		xfree(slurmd_spooldir);
		return;
	}

	xstrfmtcat(file_path, "%s/%s", slurmd_spooldir, checked_fs_name);
	/* Quote the target so blanks or shell metacharacters stay literal. */
	xstrfmtcat(command, "%s > '%s'", check_cmd, file_path);
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);
	debug("%s: file_path: %s", __func__, file_path);
	debug("%s: command: %s", __func__, command);

	/*
	 * Write result of mount command as the file system information.
	 * system() returns -1 (with errno set) only when the child could not
	 * be created or reaped; any other non-zero value is a wait status and
	 * errno says nothing about it.
	 */
	status = system(command);
	if (status == -1)
		error("%s: system(%s): %m", __func__, command);
	else if (status != 0)
		error("%s: system(%s): wait status 0x%x",
			__func__, command, (unsigned int) status);
	else if (stat(file_path, &stat_buf) < 0)
		error("%s: stat(%s): %m", __func__, file_path);

	xfree(slurmd_spooldir);
	xfree(file_path);
	xfree(command);
}

/*
 * Set environment variables for ve_exec command.
 *
 * IN/OUT env_ptr - environment array to update
 * IN local_list  - comma separated list of VE numbers
 * RET SLURM_SUCCESS, or SLURM_ERROR when local_list is NULL or cannot be
 *     parsed as a list of numbers
 *
 * On success _VENODELIST receives the list with every comma replaced by a
 * blank and VE_NODE_NUMBER receives the smallest number in the list.
 */
static int _set_env_ve_exec(char ***env_ptr, char *local_list)
{
	char *list_copy = NULL, *lowest_str = NULL;
	char *cursor = NULL, *stop = NULL;
	uint64_t value, lowest;

	// Duplicate all VE numbers allocated to a job or job step.
	debug("%s: local_list: %s", __func__, local_list);
	list_copy = xstrdup(local_list);
	if (!list_copy) {
		error("%s: fail to duplicate local_list", __func__);
		return SLURM_ERROR;
	}

	// Find the smallest VE number in the list.
	cursor = list_copy;
	errno = 0;
	lowest = strtoul(cursor, &stop, 10);
	if (*stop == ',') {
		for (cursor = (stop + 1); *stop != '\0';
		     cursor = (*stop == ',') ? (stop + 1) : stop) {
			value = strtoul(cursor, &stop, 10);
			if ((errno != 0) || (cursor == stop))
				break;
			if (value < lowest)
				lowest = value;
		}
	}

	// Anything left unparsed, or a conversion error, is rejected.
	if ((errno != 0) || (*stop != '\0')) {
		error("%s: invalid value local_list: %s: %m",
			__func__, list_copy);
		xfree(list_copy);
		return SLURM_ERROR;
	}

	// Turn the comma separated list into a blank separated one.
	for (char *sep = list_copy; *sep != '\0'; sep++) {
		if (*sep == ',')
			*sep = ' ';
	}
	debug("%s: ve_list: %s", __func__, list_copy);
	// Set all VE numbers allocated to a job or job step.
	env_array_overwrite(env_ptr, "_VENODELIST", list_copy);

	xstrfmtcat(lowest_str, "%"PRIu64, lowest);
	debug("%s: ve_node_number: %s", __func__, lowest_str);
	// Set minimum value of VE numbers.
	env_array_overwrite(env_ptr, "VE_NODE_NUMBER", lowest_str);

	xfree(list_copy);
	xfree(lowest_str);

	return SLURM_SUCCESS;
}

/*
 * Set environment variable _NEC_NQSV_JOB according to whether the Aurora
 * system uses ScaTeFS.
 *
 * IN checked_fs  - path of the file holding the file system information
 * IN/OUT env_ptr - environment array to update
 *
 * _NEC_NQSV_JOB is set to "1" when any line of the file contains
 * "type scatefs" and to an empty string otherwise.  If the file cannot be
 * opened the environment is left untouched.
 */
static void _set_scatefs_isused(char *checked_fs, char ***env_ptr)
{
	const char *needle = "type scatefs";
	char buf[BUF_SIZE];
	char *flag_value = NULL;
	bool scatefs_found = false;
	FILE *info = fopen(checked_fs, "r");

	if (info == NULL) {
		error("%s: fopen(%s): %m", __func__, checked_fs);
		return;
	}
	debug("%s: checked_fs: %s", __func__, checked_fs);

	// Stop reading at the first line that mentions ScaTeFS.
	while (!scatefs_found && fgets(buf, sizeof(buf), info)) {
		if (xstrstr(buf, needle) != NULL) {
			debug("%s: line: %s", __func__, buf);
			scatefs_found = true;
		}
	}
	fclose(info);

	flag_value = xstrdup(scatefs_found ? "1" : "");

	env_array_overwrite(env_ptr, "_NEC_NQSV_JOB", flag_value);

	if (flag_value) {
		debug("%s: env value nqsv_job: %s", __func__, flag_value);
		xfree(flag_value);
	}
}

/*
 * Set device name and port number for all HCAs to
 * the environment variables for ScaTeFS.
 *
 * IN all_hcalist - path of the file listing the HCAs
 * IN/OUT env_ptr - environment array to update; _VENODELIST must be set
 *
 * The first line of the file that starts with "mlx5_" is taken as the HCA
 * list.  That list is repeated once per entry of _VENODELIST, separated by
 * blanks, and stored in _NEC_HCA_LIST_IO and _NEC_HCA_LIST_MPI.
 * Failures are logged and leave the environment untouched.
 */
static void _set_env_all_hcalist(char *all_hcalist, char ***env_ptr)
{
	char *ve_list = NULL, *vehca_list = NULL;
	char *startp = NULL, *prefix = NULL;
	FILE *fp = NULL;
	char line[BUF_SIZE] = "";
	const char *search_word = "mlx5_";
	size_t search_len = strlen(search_word);
	bool found = false;

	/*
	 * Device name and port number for all HCAs are written
	 * to a file in slurmd spool directory.
	 */
	if ((fp = fopen(all_hcalist, "r")) == NULL) {
		error("%s: fopen(%s): %m", __func__, all_hcalist);
		return;
	}
	debug("%s: all_hcalist: %s", __func__, all_hcalist);

	while (fgets(line, sizeof(line), fp)) {
		/*
		 * Device name and port number are written in
		 * following format.
		 * <device-name>:<port-number>[,<device-name>:<port-number>...]
		 * ex) mlx5_0:1,mlx5_1:1
		 */
		if (xstrncmp(line, search_word, search_len) != 0)
			continue;
		/*
		 * Cut the line at its newline.  "line" is a stack buffer, so
		 * it is edited in place rather than through the xstring
		 * helpers, which operate on heap strings.
		 */
		line[strcspn(line, "\n")] = '\0';
		found = true;
		break;
	}
	fclose(fp);

	/*
	 * Only a line that really matched counts; an empty file or a last
	 * line merely containing the search word must not be used.
	 */
	if (!found) {
		error("%s: no hca_list: %s", __func__, all_hcalist);
		return;
	}
	debug("%s: hca_list: %s", __func__, line);

	ve_list = xstrdup(getenvp(*env_ptr, "_VENODELIST"));
	if (!ve_list) {
		error("%s: fail to get env venodelist", __func__);
		return;
	}
	debug("%s: ve_list: %s", __func__, ve_list);

	// Set device name and port number as much as number of VEs.
	xstrcat(vehca_list, line);
	startp = ve_list;
	while ((prefix = xstrchr(startp, ' '))) {
		xstrcatchar(vehca_list, *prefix);
		xstrcat(vehca_list, line);
		startp = (prefix + 1);
	}
	debug("%s: vehca_list: %s", __func__, vehca_list);
	env_array_overwrite(env_ptr, "_NEC_HCA_LIST_IO", vehca_list);
	env_array_overwrite(env_ptr, "_NEC_HCA_LIST_MPI", vehca_list);

	xfree(ve_list);
	xfree(vehca_list);
}

/*
 * Set environment variables for ScaTeFS(file system).
 *
 * IN/OUT env_ptr - environment array to update
 *
 * Looks for the files "checkedFS" and "all_hcalist" in the slurmd spool
 * directory.  Each file that stat() can see is handed to its reader
 * (_set_scatefs_isused() / _set_env_all_hcalist()); a file that cannot be
 * seen is only logged.
 */
static void _set_env_scatefs(char ***env_ptr)
{
	char *spool = NULL, *fs_path = NULL, *hca_path = NULL;
	struct stat st;

	spool = xstrdup(slurm_conf.slurmd_spooldir);
	if (!spool) {
		error("%s: fail to duplicate slurmd_spooldir", __func__);
		return;
	}
	debug("%s: slurmd_spooldir: %s", __func__, spool);

	// File system information: decides the value of _NEC_NQSV_JOB.
	xstrfmtcat(fs_path, "%s/%s", spool, checked_fs_name);
	if (stat(fs_path, &st) >= 0) {
		debug("%s: checked_fs: %s", __func__, fs_path);
		_set_scatefs_isused(fs_path, env_ptr);
	} else {
		error("%s: stat(%s)", __func__, fs_path);
	}
	xfree(fs_path);

	// HCA list: device name and port number for all HCAs.
	xstrfmtcat(hca_path, "%s/%s", spool, all_hcalist_name);
	if (stat(hca_path, &st) >= 0) {
		debug("%s: all_hcalist: %s", __func__, hca_path);
		_set_env_all_hcalist(hca_path, env_ptr);
	} else {
		error("%s: stat(%s)", __func__, hca_path);
	}
	xfree(hca_path);

	xfree(spool);
}

/*
 * Get number of VE tasks to run on target VH node.
 *
 * The local host name (from gethostname()) is searched in SLURM_NODELIST to
 * get its 1-based position.  The entry at that position is then taken from
 * _NEC_TASKS_PER_NODE, whose comma separated entries have the form
 * "<ntasks>" or "<ntasks>(x<nodes>)", e.g. "2(x3),1".
 *
 * IN env_ptr - environment array holding the two variables named above
 * RET number of tasks for this node, or 0 on any failure
 */
static uint64_t _get_ntasks_on_node(char ***env_ptr)
{
	char *node_list = NULL, *host_name = NULL;
	char *env_value = NULL, *startp = NULL, *endp = NULL;
	char node_name[MAX_SLURM_NAME];
	hostlist_t host_list = NULL;
	unsigned int host_count = 0, node_number = 0, node_count = 0;
	uint64_t ntasks = 0, nodes = 0;

	node_list = xstrdup(getenvp(*env_ptr, "SLURM_NODELIST"));
	if (!node_list) {
		error("%s: fail to get env nodelist", __func__);
		return 0;
	}
	debug("%s: node_list: %s", __func__, node_list);

	if (gethostname(node_name, sizeof(node_name)) != 0) {
		error("%s: gethostname: %m", __func__);
		xfree(node_list);
		return 0;
	}
	/* gethostname() need not terminate the buffer when it truncates. */
	node_name[sizeof(node_name) - 1] = '\0';
	debug("%s: node_name: %s", __func__, node_name);

	/*
	 * Retrieve the 1-based position of the target VH node name
	 * in environment variable SLURM_NODELIST.
	 */
	host_list = hostlist_create(node_list);
	while ((host_name = hostlist_shift(host_list))) {
		host_count++;
		if (strcmp(host_name, node_name) == 0) {
			node_number = host_count;
			free(host_name);
			break;
		}
		free(host_name);
	}
	hostlist_destroy(host_list);
	if (node_number == 0) {
		error("%s: fail to find hostname: %s", __func__, node_name);
		xfree(node_list);
		return 0;
	}
	xfree(node_list);
	debug("%s: node_number: %u", __func__, node_number);

	env_value = xstrdup(getenvp(*env_ptr, "_NEC_TASKS_PER_NODE"));
	if (!env_value) {
		error("%s: fail to get env tasks_per_ve", __func__);
		return 0;
	}
	debug("%s: tasks_per_node: %s", __func__, env_value);

	/*
	 * Retrieve number of VE tasks on target VH node
	 * in environment variable _NEC_TASKS_PER_NODE.
	 * NOTE(review): when the value ends before this node's position is
	 * reached, the count of the last entry is returned; confirm that
	 * this is intended.
	 */
	startp = env_value;
	do {
		errno = 0;
		ntasks = strtoul(startp, &endp, 10);
		if ((errno != 0) || (startp == endp) || (ntasks == 0)) {
			error("%s: invalid value tasks_per_node: %s: %m",
				__func__, env_value);
			xfree(env_value);
			return 0;
		}
		/* Here node_count < node_number, so this cannot pass it. */
		node_count++;
		if (*endp == '(') {
			/*
			 * Only "(x<nodes>" is valid.  Checking the 'x' also
			 * keeps the cursor inside the string when the value
			 * ends right after '('.
			 */
			if (endp[1] != 'x') {
				error("%s: invalid value tasks_per_node: %s: %m"
					, __func__, env_value);
				xfree(env_value);
				return 0;
			}
			startp = (endp + 2);
			errno = 0;
			nodes = strtoul(startp, &endp, 10);
			if ((errno != 0) || (startp == endp) || (nodes == 0)) {
				error("%s: invalid value tasks_per_node: %s: %m"
					, __func__, env_value);
				xfree(env_value);
				return 0;
			}
			/*
			 * The entry covers "nodes" consecutive nodes, one of
			 * which is already counted.  Advance by the rest but
			 * never beyond the target position.
			 */
			uint64_t extra = (nodes - 1);
			uint64_t room = (node_number - node_count);
			node_count += (unsigned int)
				((extra < room) ? extra : room);
			if (*endp == ')')
				endp += 1;
		}
		if (node_count == node_number) {
			debug("%s: ntasks: %"PRIu64, __func__, ntasks);
			break;
		}
		if (*endp == ',')
			startp = (endp + 1);
		else
			startp = endp;
	} while((*endp != '\0'));
	xfree(env_value);

	return ntasks;
}

/*
 * Get number of VE tasks to run on one VE node.
 *
 * IN ntasks - number of VE tasks on the VH node
 * IN ve_num - number of VEs the tasks are spread over
 * RET ntasks divided by ve_num, rounded up, and never less than 1;
 *     0 when ve_num is 0, because no distribution exists in that case
 */
static uint64_t _get_ntasks_on_ve(uint64_t ntasks, uint64_t ve_num)
{
	uint64_t quotient, remainder;

	/* Dividing by zero is undefined; report it instead. */
	if (ve_num == 0)
		return 0;

	quotient = (ntasks / ve_num);
	remainder = (ntasks % ve_num);

	/* Fewer tasks than VEs (including none): still one per VE. */
	if (quotient == 0)
		return 1;

	/* Round up so that ve_num VEs can hold all tasks. */
	return (remainder == 0) ? quotient : (quotient + 1);
}

/*
 * Set number of VE tasks to run on each VE nodes.
 *
 * IN/OUT env_ptr - environment array; _NECMPI_VE_NUM_NODES and _VENODELIST
 *                  are read from it, together with whatever
 *                  _get_ntasks_on_node() reads
 * RET SLURM_SUCCESS, or SLURM_ERROR when a required value is missing or
 *     invalid
 *
 * The tasks of this VH node are spread over its VEs, filling each VE with
 * the per-VE count from _get_ntasks_on_ve() until no task is left.  The
 * resulting comma separated counts (e.g. "3,3,3,1") are stored in
 * _NEC_TASKS_PER_VE.
 */
static int _set_tasks_per_ve(char ***env_ptr)
{
	char *env_value = NULL, *startp = NULL, *endp = NULL;
	char *ve_list = NULL, *prefix = NULL;
	uint64_t ntasks = 0, ve_num = 0, ntasks_on_ve = 0, tasks_count = 0;

	// Get number of VE tasks to run on target VH node.
	ntasks = _get_ntasks_on_node(env_ptr);
	if (ntasks == 0) {
		error("%s: fail to get ntasks on node", __func__);
		return SLURM_ERROR;
	}

	env_value = xstrdup(getenvp(*env_ptr, "_NECMPI_VE_NUM_NODES"));
	if (!env_value) {
		error("%s: fail to get env ve_num_nodes", __func__);
		return SLURM_ERROR;
	}
	debug("%s: ve_num_nodes: %s", __func__, env_value);
	/*
	 * The VE count is used as a divisor below, so a value that is not a
	 * number, or that is zero, must be rejected here.
	 */
	errno = 0;
	ve_num = strtoul(env_value, &endp, 10);
	if ((errno != 0) || (env_value == endp) || (ve_num == 0)) {
		error("%s: invalid value ve_num_nodes: %s: %m",
			__func__, env_value);
		xfree(env_value);
		return SLURM_ERROR;
	}
	xfree(env_value);

	debug("%s: ntasks: %"PRIu64, __func__, ntasks);
	debug("%s: ve_num: %"PRIu64, __func__, ve_num);
	// Get number of VE tasks to run on one VE node.
	ntasks_on_ve = _get_ntasks_on_ve(ntasks, ve_num);
	debug("%s: ntasks_on_ve: %"PRIu64, __func__, ntasks_on_ve);

	// The first VE always gets a full share.
	xstrfmtcat(env_value, "%"PRIu64, ntasks_on_ve);
	tasks_count = ntasks_on_ve;
	if (tasks_count < ntasks) {
		ve_list = xstrdup(getenvp(*env_ptr, "_VENODELIST"));
		if (!ve_list) {
			error("%s: fail to get env venodelist", __func__);
			xfree(env_value);
			return SLURM_ERROR;
		}
		debug("%s: ve_list: %s", __func__, ve_list);
		startp = ve_list;
		/*
		 * Set number of VE tasks to run on each VE nodes
		 * as much as number of VEs.  Each blank in the list stands
		 * for one further VE; the VE that takes the last tasks gets
		 * only what is left and ends the loop.
		 */
		while ((prefix = xstrchr(startp, ' ')) != NULL) {
			xstrcatchar(env_value, ',');
			if ((tasks_count + ntasks_on_ve) >= ntasks) {
				xstrfmtcat(env_value, "%"PRIu64,
					(ntasks - tasks_count));
				break;
			}
			xstrfmtcat(env_value, "%"PRIu64, ntasks_on_ve);
			tasks_count += ntasks_on_ve;
			startp = (prefix + 1);
		}
		xfree(ve_list);
	}

	debug("%s: tasks_per_ve: %s", __func__, env_value);
	env_array_overwrite(env_ptr, "_NEC_TASKS_PER_VE", env_value);
	xfree(env_value);

	return SLURM_SUCCESS;
}

/*
 * Write names of all VH nodes allocated to a job
 * to slurmd spool directory.
 *
 * IN ntasks         - total number of tasks; at most this many lines are
 *                     produced from the per-node counts
 * IN tasks_per_node - comma separated task counts, one entry per node, each
 *                     "<ntasks>" or "<ntasks>(x<nodes>)" (e.g. "2(x3),1")
 * IN nodelist       - host list expression naming the nodes
 * IN job_id         - job id, used as the name of the per-job directory
 * RET path of the written file <slurmd_spooldir>/<job_id>/mpinodes, to be
 *     released with xfree(), or NULL on failure
 *
 * Each host name is written once per task counted for it, one per line.
 */
static char *_write_vh_nodefile(uint64_t ntasks,
				char *tasks_per_node,
				char *nodelist,
				char *job_id)
{
	char *slurmd_spooldir = NULL, *dir_name = NULL, *file_path = NULL;
	char *startp = NULL, *endp = NULL, *host_name = NULL;
	const char *file_name = "mpinodes";
	uint64_t ntasks_on_node = 0, task_count = 0, repeat = 0;
	hostlist_t host_list = NULL;
	FILE *fp = NULL;

	debug("%s: ntasks: %"PRIu64, __func__, ntasks);
	debug("%s: tasks_per_node: %s", __func__, tasks_per_node);
	debug("%s: nodelist: %s", __func__, nodelist);
	debug("%s: job_id: %s", __func__, job_id);

	slurmd_spooldir = xstrdup(slurm_conf.slurmd_spooldir);
	if (!slurmd_spooldir) {
		error("%s: fail to duplicate slurmd_spooldir", __func__);
		return NULL;
	}
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);

	// Make a directory for writing which is unique to each job id.
	xstrfmtcat(dir_name, "%s/%s", slurmd_spooldir, job_id);
	errno = 0;
	if ((mkdir(dir_name, 0775) < 0) && (errno != EEXIST)) {
		error("%s: mkdir(%s): %m", __func__, dir_name);
		xfree(slurmd_spooldir);
		xfree(dir_name);
		return NULL;
	}
	xfree(slurmd_spooldir);
	debug("%s: dir_name: %s", __func__, dir_name);

	xstrfmtcat(file_path, "%s/%s", dir_name, file_name);
	if ((fp = fopen(file_path, "w+")) == NULL) {
		error("%s: fopen(%s): %m", __func__, file_path);
		xfree(dir_name);
		xfree(file_path);
		return NULL;
	}
	xfree(dir_name);
	debug("%s: file_path: %s", __func__, file_path);

	host_list = hostlist_create(nodelist);
	startp = tasks_per_node;
	while ((host_name = hostlist_shift(host_list))) {
		/*
		 * "repeat" is the number of hosts the current entry still
		 * applies to.  A new entry is parsed only once it is used up.
		 */
		if (repeat == 0) {
			// No entry left: the remaining hosts get no lines.
			if (*startp == '\0') {
				free(host_name);
				break;
			}
			// Get number of VE tasks on each VH nodes.
			errno = 0;
			ntasks_on_node = strtoul(startp, &endp, 10);
			if ((errno != 0) || (startp == endp))
				goto invalid;
			repeat = 1;
			if ((endp[0] == '(') && (endp[1] == 'x')) {
				// "(x<nodes>)": same count for <nodes> hosts.
				startp = (endp + 2);
				errno = 0;
				repeat = strtoul(startp, &endp, 10);
				if ((errno != 0) || (startp == endp) ||
				    (repeat == 0) || (*endp != ')'))
					goto invalid;
				endp++;
			}
			if (*endp == ',')
				endp++;
			else if (*endp != '\0')
				goto invalid;
			startp = endp;
		}
		repeat--;

		debug("%s: host_name: %s", __func__, host_name);
		debug("%s: ntasks_on_node: %"PRIu64, __func__, ntasks_on_node);
		// Write VH node names as much as number of VE tasks.
		for (uint64_t i = 0; i < ntasks_on_node; i++) {
			if ((task_count + i) >= ntasks)
				break;
			if (fprintf(fp, "%s\n", host_name) < 0) {
				error("%s: fprintf(%s): %m",
					__func__, file_path);
				goto fail;
			}
		}
		task_count += ntasks_on_node;

		free(host_name);
	}
	hostlist_destroy(host_list);

	/* Buffered write errors only become visible when closing. */
	if (fclose(fp) != 0) {
		error("%s: fclose(%s): %m", __func__, file_path);
		xfree(file_path);
		return NULL;
	}

	return file_path;

invalid:
	error("%s: invalid value tasks_per_node: %s",
		__func__, tasks_per_node);
fail:
	free(host_name);
	hostlist_destroy(host_list);
	fclose(fp);
	xfree(file_path);
	return NULL;
}

/*
 * Set path of VH node file to the environment variable
 * SLURM_NODEFILE.
 *
 * IN/OUT env_ptr - environment array; SLURM_NTASKS, _NEC_TASKS_PER_NODE,
 *                  SLURM_NODELIST and SLURM_JOB_ID are read from it
 * IN is_job      - when false nothing is done and SLURM_SUCCESS is returned
 * RET SLURM_SUCCESS, or SLURM_ERROR when a variable is missing or invalid
 *     or the node file cannot be written
 */
static int _set_env_vh_nodefile(char ***env_ptr, bool is_job)
{
	char *ntasks_str = NULL, *parse_end = NULL;
	char *per_node = NULL, *nodes = NULL, *jobid = NULL;
	char *nodefile = NULL;
	uint64_t ntasks = 0;
	int rc = SLURM_ERROR;

	if (!is_job)
		return SLURM_SUCCESS;

	// Total number of tasks: must be a number greater than zero.
	ntasks_str = xstrdup(getenvp(*env_ptr, "SLURM_NTASKS"));
	if (!ntasks_str) {
		error("%s: fail to get env ntasks", __func__);
		return SLURM_ERROR;
	}
	errno = 0;
	ntasks = strtoul(ntasks_str, &parse_end, 10);
	if (!((errno == 0) && (parse_end != ntasks_str) && (ntasks != 0))) {
		error("%s: invalid value ntasks: %s: %m", __func__, ntasks_str);
		xfree(ntasks_str);
		return SLURM_ERROR;
	}
	xfree(ntasks_str);
	debug("%s: ntasks: %lu", __func__, ntasks);

	per_node = xstrdup(getenvp(*env_ptr, "_NEC_TASKS_PER_NODE"));
	if (!per_node) {
		error("%s: fail to get env tasks_per_node", __func__);
		return SLURM_ERROR;
	}
	debug("%s: tasks_per_node: %s", __func__, per_node);

	nodes = xstrdup(getenvp(*env_ptr, "SLURM_NODELIST"));
	if (!nodes) {
		error("%s: fail to get env nodelist", __func__);
		goto free_per_node;
	}
	debug("%s: nodelist: %s", __func__, nodes);

	jobid = xstrdup(getenvp(*env_ptr, "SLURM_JOB_ID"));
	if (!jobid) {
		error("%s: fail to get env job_id", __func__);
		goto free_nodes;
	}
	debug("%s: job_id: %s", __func__, jobid);

	/*
	 * Write names of all VH nodes to VH node file,
	 * and get the file path as return value.
	 */
	nodefile = _write_vh_nodefile(ntasks, per_node, nodes, jobid);
	if (nodefile) {
		debug("%s: file_path: %s", __func__, nodefile);
		env_array_overwrite(env_ptr, "SLURM_NODEFILE", nodefile);
		rc = SLURM_SUCCESS;
	} else {
		error("%s: fail to write vh nodefile", __func__);
	}

	// Release in the same order on success and on failure.
	xfree(per_node);
	xfree(nodes);
	xfree(jobid);
	if (nodefile)
		xfree(nodefile);
	return rc;

free_nodes:
	xfree(per_node);
	xfree(nodes);
	return SLURM_ERROR;

free_per_node:
	xfree(per_node);
	return SLURM_ERROR;
}

/*
 * Write VE numbers of all VE nodes allocated to a job or job step
 * to slurmd spool directory.
 *
 * IN tasks_per_ve - comma separated task counts, one per VE
 * IN ve_list      - VE numbers, one per entry of tasks_per_ve, separated by
 *                   a single character (a blank in _VENODELIST)
 * IN job_id       - job id, used as the name of the per-job directory
 * RET path of the written file <slurmd_spooldir>/<job_id>/mpinodes_ve, to
 *     be released with xfree(), or NULL on failure
 *
 * For every entry a line "ve=<number>" is written once per task.
 */
static char *_write_ve_nodefile(char *tasks_per_ve,
				char *ve_list,
				char *job_id)
{
	char *slurmd_spooldir = NULL, *dir_name = NULL, *file_path = NULL;
	char *tasks_nptr = NULL, *tasks_endp = NULL;
	char *ve_nptr = NULL, *ve_endp = NULL;
	const char *file_name = "mpinodes_ve";
	uint64_t ntasks_on_ve = 0, ve_num = 0;
	FILE *fp = NULL;

	debug("%s: tasks_per_ve: %s", __func__, tasks_per_ve);
	debug("%s: ve_list: %s", __func__, ve_list);
	debug("%s: job_id: %s", __func__, job_id);

	slurmd_spooldir = xstrdup(slurm_conf.slurmd_spooldir);
	if (!slurmd_spooldir) {
		error("%s: fail to duplicate slurmd_spooldir", __func__);
		return NULL;
	}
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);

	// Make a directory for writing which is unique to each job id.
	xstrfmtcat(dir_name, "%s/%s", slurmd_spooldir, job_id);
	errno = 0;
	if ((mkdir(dir_name, 0775) < 0) && (errno != EEXIST)) {
		error("%s: mkdir(%s): %m", __func__, dir_name);
		xfree(slurmd_spooldir);
		xfree(dir_name);
		return NULL;
	}
	xfree(slurmd_spooldir);
	debug("%s: dir_name: %s", __func__, dir_name);

	xstrfmtcat(file_path, "%s/%s", dir_name, file_name);
	if ((fp = fopen(file_path, "w+")) == NULL) {
		error("%s: fopen(%s): %m", __func__, file_path);
		xfree(dir_name);
		xfree(file_path);
		return NULL;
	}
	xfree(dir_name);
	debug("%s: file_path: %s", __func__, file_path);

	tasks_nptr = tasks_per_ve;
	ve_nptr = ve_list;
	// Write VE numbers of all VE nodes.
	for (;;) {
		/*
		 * Get number of VE tasks to run on one VE node.  An entry
		 * must be a number followed by ',' or the end of the string;
		 * anything else would leave the cursor where it is and the
		 * loop would never end.
		 */
		errno = 0;
		ntasks_on_ve = strtoul(tasks_nptr, &tasks_endp, 10);
		if ((errno != 0) || (tasks_nptr == tasks_endp) ||
		    ((*tasks_endp != ',') && (*tasks_endp != '\0'))) {
			error("%s: invalid value tasks_per_ve: %s",
				__func__, tasks_per_ve);
			goto fail;
		}
		debug("%s: ntasks_on_ve: %"PRIu64, __func__, ntasks_on_ve);

		/*
		 * Get VE number of target VE node.  This also fails when
		 * ve_list has fewer entries than tasks_per_ve.
		 */
		errno = 0;
		ve_num = strtoul(ve_nptr, &ve_endp, 10);
		if ((errno != 0) || (ve_nptr == ve_endp)) {
			error("%s: invalid value ve_list: %s",
				__func__, ve_list);
			goto fail;
		}
		debug("%s: ve_num: %"PRIu64, __func__, ve_num);

		// Write VE number as much as number of VE tasks.
		while (ntasks_on_ve > 0) {
			if (fprintf(fp, "ve=%"PRIu64"\n", ve_num) < 0) {
				error("%s: fprintf(%s): %m",
					__func__, file_path);
				goto fail;
			}
			ntasks_on_ve--;
		}

		if (*tasks_endp == '\0')
			break;
		tasks_nptr = (tasks_endp + 1);
		/* Skip the separator, but never step past the terminator. */
		ve_nptr = (*ve_endp != '\0') ? (ve_endp + 1) : ve_endp;
	}

	/* Buffered write errors only become visible when closing. */
	if (fclose(fp) != 0) {
		error("%s: fclose(%s): %m", __func__, file_path);
		xfree(file_path);
		return NULL;
	}

	return file_path;

fail:
	fclose(fp);
	xfree(file_path);
	return NULL;
}

/*
 * Set path of VE node file to the environment variable
 * SLURM_NODEFILE_VE.
 *
 * IN/OUT env_ptr - environment array; _NEC_TASKS_PER_VE, _VENODELIST and
 *                  SLURM_JOB_ID are read from it
 *
 * A missing variable or a failure to write the file is logged and leaves
 * SLURM_NODEFILE_VE untouched.
 */
static void _set_env_ve_nodefile(char ***env_ptr)
{
	char *per_ve = NULL, *ves = NULL, *jobid = NULL;
	char *nodefile = NULL;

	per_ve = xstrdup(getenvp(*env_ptr, "_NEC_TASKS_PER_VE"));
	if (!per_ve) {
		error("%s: fail to get env tasks_per_ve", __func__);
		return;
	}
	debug("%s: tasks_per_ve: %s", __func__, per_ve);

	ves = xstrdup(getenvp(*env_ptr, "_VENODELIST"));
	if (!ves) {
		error("%s: fail to get env venodelist", __func__);
		goto free_per_ve;
	}
	debug("%s: ve_list: %s", __func__, ves);

	jobid = xstrdup(getenvp(*env_ptr, "SLURM_JOB_ID"));
	if (!jobid) {
		error("%s: fail to get env job_id", __func__);
		goto free_ves;
	}
	debug("%s: job_id: %s", __func__, jobid);

	/*
	 * Write VE numbers of all VE nodes allocated to a job or job step,
	 * and get the file path as return value.
	 */
	nodefile = _write_ve_nodefile(per_ve, ves, jobid);
	if (nodefile) {
		debug("%s: file_path: %s", __func__, nodefile);
		env_array_overwrite(env_ptr, "SLURM_NODEFILE_VE", nodefile);
		xfree(nodefile);
	} else {
		error("%s: fail to write ve nodefile", __func__);
	}

	xfree(per_ve);
	xfree(ves);
	xfree(jobid);
	return;

free_ves:
	xfree(per_ve);
	xfree(ves);
	return;

free_per_ve:
	xfree(per_ve);
}

/*
 * Prepare the task distribution variables used by NEC MPI.
 *
 * IN/OUT env_ptr - environment array to update
 * IN is_job      - when true, SLURM_TASKS_PER_NODE is first copied to
 *                  _NEC_TASKS_PER_NODE
 * RET SLURM_SUCCESS, or SLURM_ERROR when SLURM_TASKS_PER_NODE is missing
 *     (is_job only) or _set_tasks_per_ve() fails
 */
static int _set_env_mpi_ve(char ***env_ptr, bool is_job)
{
	if (is_job) {
		// Save number of VE tasks to run on each VH nodes.
		char *per_node =
			xstrdup(getenvp(*env_ptr, "SLURM_TASKS_PER_NODE"));

		if (!per_node) {
			error("%s: fail to get env tasks_per_node",
				__func__);
			return SLURM_ERROR;
		}
		debug("%s: tasks_per_node: %s", __func__, per_node);
		env_array_overwrite(env_ptr, "_NEC_TASKS_PER_NODE", per_node);
		xfree(per_node);
	}

	// Set number of VE tasks to run on each VE nodes.
	return (_set_tasks_per_ve(env_ptr) != SLURM_SUCCESS) ?
		SLURM_ERROR : SLURM_SUCCESS;
}

/*
 * Set environment variables for NEC MPI.
 *
 * IN/OUT env_ptr - environment array to update
 * IN local_list  - comma separated list of VE numbers
 * IN is_job      - passed on to _set_env_mpi_ve() and
 *                  _set_env_vh_nodefile()
 *
 * _NECMPI_VE_NODELIST and _NECMPI_VE_NUM_NODES are always set when
 * local_list is not NULL.  The node files and NMPI_LAUNCHER_EXEC follow
 * only if the task distribution and the VH node file succeed.
 */
static void _set_env_mpi(char ***env_ptr, char *local_list, bool is_job)
{
	char *ves = NULL, *text = NULL;
	uint64_t ve_total = 1;

	ves = xstrdup(local_list);
	if (!ves) {
		error("%s: fail to duplicate local_list", __func__);
		return;
	}
	debug("%s: local_list: %s", __func__, local_list);

	// One VE per separator plus one; separators become blanks.
	for (char *sep = ves; *sep != '\0'; sep++) {
		if (*sep == ',') {
			ve_total++;
			*sep = ' ';
		}
	}
	debug("%s: ve_list: %s", __func__, ves);

	// Set all VE numbers allocated to a job or job step.
	env_array_overwrite(env_ptr, "_NECMPI_VE_NODELIST", ves);

	xstrfmtcat(text, "%"PRIu64, ve_total);
	debug("%s: ve_num: %s", __func__, text);
	// Set number of VEs allocated to a job or job step.
	env_array_overwrite(env_ptr, "_NECMPI_VE_NUM_NODES", text);

	xfree(ves);
	xfree(text);

	// Both steps must succeed before the VE node file is written.
	if ((_set_env_mpi_ve(env_ptr, is_job) != SLURM_SUCCESS) ||
	    (_set_env_vh_nodefile(env_ptr, is_job) != SLURM_SUCCESS))
		return;

	/*
	 * Set path of VE node file to the environment variable
	 * SLURM_NODEFILE_VE.
	 */
	_set_env_ve_nodefile(env_ptr);

	text = xstrdup("/opt/nec/ve/mpi/libexec/srun_remotehost");
	debug("%s: launcher: %s", __func__, text);
	env_array_overwrite(env_ptr, "NMPI_LAUNCHER_EXEC", text);
	xfree(text);
}

/*
 * Common worker behind job_set_env(), step_set_env() and step_reset_env().
 *
 * IN/OUT env_ptr      - environment array to update
 * IN gres_ptr, node_inx, usable_gres, reset, flags
 *                     - passed unchanged to common_gres_set_env()
 * IN/OUT already_seen - when true on entry, the lists already stored in the
 *                       environment are used as the starting point; set to
 *                       true once a local list has been produced
 * IN/OUT local_inx    - passed to common_gres_set_env()
 * IN is_job           - selects SLURM_JOB_VES (true) or SLURM_STEP_VES
 *
 * The global list goes to SLURM_JOB_VES / SLURM_STEP_VES and the local list
 * to VE_VISIBLE_DEVICES.  With a local list, the ve_exec variables are set
 * and, if that succeeds, the ScaTeFS and NEC MPI variables as well.
 */
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job, gres_internal_flags_t flags)
{
	char *global_devs = NULL, *local_devs = NULL;
	char *list_var = is_job ? "SLURM_JOB_VES" : "SLURM_STEP_VES";

	if (*already_seen) {
		global_devs = xstrdup(getenvp(*env_ptr, list_var));
		local_devs = xstrdup(getenvp(*env_ptr,
					     "VE_VISIBLE_DEVICES"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "", local_inx, NULL,
			    &local_devs, &global_devs, reset, is_job, NULL,
			    flags);

	if (global_devs) {
		env_array_overwrite(env_ptr, list_var, global_devs);
		xfree(global_devs);
	}

	if (!local_devs)
		return;

	env_array_overwrite(env_ptr, "VE_VISIBLE_DEVICES", local_devs);

	// ScaTeFS and NEC MPI depend on the ve_exec variables.
	if (_set_env_ve_exec(env_ptr, local_devs) == SLURM_SUCCESS) {
		_set_env_scatefs(env_ptr);
		_set_env_mpi(env_ptr, local_devs, is_job);
	}

	xfree(local_devs);
	*already_seen = true;
}

/*
 * Plugin entry point "init": logs that the plugin was loaded.
 * RET SLURM_SUCCESS always
 */
extern int init(void)
{
	debug("%s: %s loaded", __func__, plugin_name);
	return SLURM_SUCCESS;
}

/*
 * Plugin entry point "fini": logs the unload and releases the device list.
 * RET SLURM_SUCCESS always
 */
extern int fini(void)
{
	debug("%s: unloading %s", __func__, plugin_name);

	/* Drop the devices kept since the last configuration load. */
	FREE_NULL_LIST(gres_devices);
	return SLURM_SUCCESS;
}

/*
 * Load the "ve" entries of gres.conf into the plugin's device list.
 *
 * IN gres_conf_list - configuration records, passed to
 *                     common_node_config_load()
 * IN config         - not used by this plugin
 * RET the return code of common_node_config_load()
 *
 * A device list left from an earlier call is dropped first (assumed to be
 * caused by an scontrol reconfigure).  A load failure is reported through
 * fatal(); after a successful load the file system information for the
 * Aurora system is written.
 */
extern int node_config_load(List gres_conf_list, node_config_load_t *config)
{
	int load_rc;

	if (gres_devices) {
		debug("%s Resetting gres_devices", plugin_name);
		FREE_NULL_LIST(gres_devices);
	}

	load_rc = common_node_config_load(gres_conf_list, gres_name,
					  &gres_devices);

	if (load_rc == SLURM_SUCCESS)
		_write_fsinfo();
	else
		fatal("%s failed to load configuration", plugin_name);

	return load_rc;
}

/*
 * Set environment variables as appropriate for a job (i.e. all tasks) based
 * upon the job's GRES state.
 *
 * IN/OUT job_env_ptr - environment array to update
 * IN gres_ptr, node_inx, flags - passed unchanged to _set_env()
 */
extern void job_set_env(char ***job_env_ptr, void *gres_ptr, int node_inx,
			gres_internal_flags_t flags)
{
	/*
	 * Unlike step_*_env the bookkeeping is deliberately not static: this
	 * may run in slurmd, which handles a different job on each call, and
	 * may run several times for one job (different prologs and such).
	 * State kept between calls would belong to an unrelated job or be
	 * stale after the first call.
	 */
	bool seen = false;
	int dev_inx = 0;

	_set_env(job_env_ptr, gres_ptr, node_inx, NULL,
		 &seen, &dev_inx, false, true, flags);
}

/*
 * Set environment variables as appropriate for a job step (i.e. all tasks)
 * based upon the job step's GRES state.
 *
 * IN/OUT step_env_ptr - environment array to update
 * IN gres_ptr, flags  - passed unchanged to _set_env()
 */
extern void step_set_env(char ***step_env_ptr, void *gres_ptr,
			 gres_internal_flags_t flags)
{
	/* Kept between calls, in contrast to job_set_env(). */
	static bool seen = false;
	static int dev_inx = 0;

	_set_env(step_env_ptr, gres_ptr, 0, NULL,
		 &seen, &dev_inx, false, false, flags);
}

/*
 * Reset environment variables as appropriate for a job step (i.e. this one
 * task) based upon the job step's GRES state and assigned CPUs.
 *
 * IN/OUT step_env_ptr - environment array to update
 * IN gres_ptr, usable_gres, flags - passed unchanged to _set_env()
 */
extern void step_reset_env(char ***step_env_ptr, void *gres_ptr,
			   bitstr_t *usable_gres, gres_internal_flags_t flags)
{
	/* Kept between calls; separate from the state of step_set_env(). */
	static bool seen = false;
	static int dev_inx = 0;

	_set_env(step_env_ptr, gres_ptr, 0, usable_gres,
		 &seen, &dev_inx, true, false, flags);
}

/*
 * Send GRES information to slurmstepd.
 * IN buffer - passed with the plugin's device list to common_send_stepd()
 */
extern void send_stepd(Buf buffer)
{
	common_send_stepd(buffer, gres_devices);
}

/*
 * Receive GRES information from slurmd.
 * IN buffer - passed to common_recv_stepd(), which is given the address of
 *             the plugin's device list
 */
extern void recv_stepd(Buf buffer)
{
	common_recv_stepd(buffer, &gres_devices);
}

/*
 * Get data from a job's GRES data structure.
 * This plugin provides no such data: every request is refused.
 *
 * IN job_gres_data - job's GRES data structure (unused)
 * IN node_inx      - zero-origin index of the node within the job's
 *                    allocation (unused)
 * IN data_type     - type of data requested (unused)
 * OUT data         - never written
 * RET EINVAL always
 */
extern int job_info(gres_job_state_t *job_gres_data, uint32_t node_inx,
		    enum gres_job_data_type data_type, void *data)
{
	return EINVAL;
}

/*
 * Get data from a step's GRES data structure.
 * This plugin provides no such data: every request is refused.
 *
 * IN step_gres_data - step's GRES data structure (unused)
 * IN node_inx       - zero-origin index of the node within the job's
 *                     allocation (unused)
 * IN data_type      - type of data requested (unused)
 * OUT data          - never written
 * RET EINVAL always
 */
extern int step_info(gres_step_state_t *step_gres_data, uint32_t node_inx,
		     enum gres_step_data_type data_type, void *data)
{
	return EINVAL;
}

/*
 * Return the plugin's list of devices of this type.
 * RET the current device list; NULL when none has been loaded
 */
extern List get_devices(void)
{
	return gres_devices;
}

/*
 * Step hardware initialisation hook; this plugin has nothing to do.
 * IN usable_gres - unused
 * IN settings    - unused
 */
extern void step_hardware_init(bitstr_t *usable_gres, char *settings)
{
}

/*
 * Step hardware finalisation hook; this plugin has nothing to do.
 */
extern void step_hardware_fini(void)
{
}

/*
 * Build record used to set environment variables as appropriate for a job's
 * prolog or epilog based on GRES allocated to the job.
 * This plugin builds no record.
 *
 * IN gres_job_ptr - unused
 * RET NULL always
 */
extern gres_epilog_info_t *epilog_build_env(gres_job_state_t *gres_job_ptr)
{
	return NULL;
}

/*
 * Set environment variables as appropriate for a job's prolog or epilog
 * based on GRES allocated to the job.
 * This plugin sets nothing; all arguments are unused.
 */
extern void epilog_set_env(char ***epilog_env_ptr,
			   gres_epilog_info_t *epilog_info, int node_inx)
{
}
