Source code for adaptdl.env

# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module contains functions for retrieving the values of AdaptDL
environment variables, or their defaults if unset.
"""

import os


[docs]def checkpoint_path(): """ Path to the directory used for saving and loading checkpoints. Determined by the environment variable ``ADAPTDL_CHECKPOINT_PATH``, or ``None`` if unset. Setting this environment variable is required for checkpointing, and is automatically set in AdaptDL-scheduled clusters. Returns: str: checkpoint path or ``None``. """ return os.getenv("ADAPTDL_CHECKPOINT_PATH")
[docs]def share_path(): """ Path to a directory shared by all AdaptDL job replicas, which can be used by the application, e.g. for storing downloaded datasets or artifacts. Determined by the environment variable ``ADAPTDL_SHARE_PATH``, or ``None`` if unset. Automatically set in AdaptDL-scheduled clusters. Returns: str: shared directory path or ``None``. """ return os.getenv("ADAPTDL_SHARE_PATH")
[docs]def job_id(): """ A string which uniquely identifies the current job in an AdaptDL-scheduled cluster. ``None`` if running standalone. Returns: str: unique job identifier or ``None``. """ return os.getenv("ADAPTDL_JOB_ID")
[docs]def master_addr(): """ Network address of the rank 0 replica, required for distributed training. Determined by the environment variable ``ADAPTDL_MASTER_ADDR``, or `0.0.0.0` if unset. In AdaptDL-scheduled clusters, this environment variable is unset. The rank 0 replica is discovered dynamically by querying the supervisor (:func:`supervisor_url`). Returns: str: address of the rank 0 replica, or `0.0.0.0`. """ return os.getenv("ADAPTDL_MASTER_ADDR", "0.0.0.0")
[docs]def master_port(): """ Available port for the rank 0 replica, required for distributed training. Determined by the environment variable ``ADAPTDL_MASTER_PORT``, or 0 if unset. Automatically set in AdaptDL-scheduled clusters. Returns: int: available port for the rank 0 replica, or 0. """ return int(os.getenv("ADAPTDL_MASTER_PORT", "0"))
[docs]def replica_rank(): """ Rank of the current replica, required for distributed training. Each replica is assigned a unique rank from 0 to K-1, where K is the total number of replicas. Determined by the environment variable ``ADAPTDL_REPLICA_RANK``, or 0 if unset. Automatically set in AdaptDL-scheduled clusters. Returns: int: rank of the current replica, or 0. """ return int(os.getenv("ADAPTDL_REPLICA_RANK", "0"))
[docs]def num_nodes(): """ Number of unique nodes being used for the current job. For example, if there are 4 nodes, each running 2 replicas, then this function returns 4. Determined by the environment variable ``ADAPTDL_NUM_NODES``, or is equal to :func:`num_replicas` if unset. Thus, this environment variable only needs to be set if some node runs multiple replicas. Automatically set in AdaptDL-scheduled clusters. Returns: int: number of unique nodes, or the value of :func:`num_replicas`. """ return int(os.getenv("ADAPTDL_NUM_NODES", num_replicas()))
[docs]def num_replicas(): """ Total number of replicas, required for distributed training. For example, if there are 4 nodes, each running 2 replicas, then this function returns 8. Determined by the environment variable ``ADAPTDL_NUM_REPLICAS``, or 1 if unset. Automatically set in AdaptDL-scheduled clusters. Returns: int: total number of replicas, or 1. """ return int(os.getenv("ADAPTDL_NUM_REPLICAS", "1"))
[docs]def num_restarts(): """ Number of times the current job was restarted. Determined by the environment variable ``ADAPTDL_NUM_RESTARTS``, or 0 if unset. This value is mainly informational, and is automatically set in AdaptDL-scheduled clusters. Returns: int: number of restarts, or 0. """ return int(os.getenv("ADAPTDL_NUM_RESTARTS", "0"))
[docs]def adaptdl_sched_version(): """ A string which gives the AdaptDL version of scheduler. Determined by the environment variable ``ADAPTDL_SCHED_VERSION`` or ``None`` Returns: str: AdaptDL version of scheduler, or ``None``. """ return os.environ.get("ADAPTDL_SCHED_VERSION")
[docs]def supervisor_url(): """ URL of the supervisor in an AdaptDL-scheduled cluster. The address of the rank 0 replica is dynamically discovered via the supervisor, instead of via the ``ADAPTDL_MASTER_ADDR`` environment variable. Returns: str: URL of the supervisor, or ``None``. """ return os.getenv("ADAPTDL_SUPERVISOR_URL")
[docs]def from_ray(): """ Returns True if the code is being called from Ray""" if os.getenv("ADAPTDL_TUNE_TRIAL_SCHED", "False") == "True": import ray return ray.is_initialized() else: return False