From 969e9304c67e9fc68e1f0b93ca4415b3bd6dafa1 Mon Sep 17 00:00:00 2001 From: Admin9705 <24727006+Admin9705@users.noreply.github.com> Date: Fri, 14 Mar 2025 09:51:34 -0400 Subject: [PATCH] Update tdarr_node_scaling.sh --- tdarr_node_scaling.sh | 230 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 189 insertions(+), 41 deletions(-) diff --git a/tdarr_node_scaling.sh b/tdarr_node_scaling.sh index 926f6dc..446dd0a 100644 --- a/tdarr_node_scaling.sh +++ b/tdarr_node_scaling.sh @@ -34,18 +34,15 @@ T4_TAUTULLI_URL="" # ----------- Tdarr Settings ------------- TDARR_ALTER_WORKERS=true # If true, we adjust GPU workers; otherwise we kill container on threshold -TDARR_DEFAULT_LIMIT=5 # Default GPU workers +TDARR_DEFAULT_LIMIT=5 # Default GPU workers when watchers=0 TDARR_API_URL="http://10.0.0.10:8265" # WITHOUT /api/v2 CONTAINER_NAME="N1" # Name of your Tdarr Node Docker container -# ----------- Offset Setting ------------- -# Only Applies if >>> TDARR_ALTER_WORKERS=true - -# The number of jobs is only reduced once so many transcodes are occuring -# If set to 3 - If you have 3 transcodes, then the gpu workers reduce by 1 -# If you have 4 transcodes, then the gpu workers reduce by 2 -# If set to 0 - It will reduce gpu workers by one per transcode immediately -OFFSET_THRESHOLD=3 +# ----------- Offset Setting ------------- IF >>> TDARR_ALTER_WORKERS=true +# We only start reducing workers when watchers >= OFFSET_THRESHOLD. +# e.g. If OFFSET_THRESHOLD=3, watchers <3 => no reduction, +# watchers=3 => reduce by 1, watchers=4 => reduce by 2, etc. 
+OFFSET_THRESHOLD=2 # ----------- Other ------------- WAIT_SECONDS=10 # Sleep after adjustments @@ -56,6 +53,11 @@ TRANSCODE_THRESHOLD=4 # # watchers that triggers kill or reduce workers # End of configuration ################################### +# Simple logging function with timestamp +log_message() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" +} + # ------------------------------------------------------------ # Function: find_latest_node_id # ------------------------------------------------------------ @@ -80,17 +82,17 @@ ensure_node_id_loaded() { return 0 fi - echo "$(date '+%Y-%m-%d %H:%M:%S') - Attempting to retrieve nodeID from $TDARR_NODE_LOG_PATH" + log_message "Attempting to retrieve nodeID from $TDARR_NODE_LOG_PATH" local found found=$(find_latest_node_id) if [ -z "$found" ]; then - echo "ERROR: Could not find any nodeID in $TDARR_NODE_LOG_PATH." + log_message "ERROR: Could not find any nodeID in $TDARR_NODE_LOG_PATH." return 1 fi TDARR_NODE_ID="$found" - echo "$(date '+%Y-%m-%d %H:%M:%S') - Found nodeID: $TDARR_NODE_ID" + log_message "Found nodeID: $TDARR_NODE_ID" return 0 } @@ -101,15 +103,15 @@ refresh_node_id_if_changed() { local latest latest=$(find_latest_node_id) if [ -z "$latest" ]; then - echo "WARNING: Could not find any 'nodeID' lines in the log to refresh." + log_message "WARNING: Could not find any 'nodeID' lines in the log to refresh." return fi if [ "$latest" != "$TDARR_NODE_ID" ]; then - echo "NOTICE: nodeID changed from [$TDARR_NODE_ID] -> [$latest]. Updating." + log_message "NOTICE: nodeID changed from [$TDARR_NODE_ID] -> [$latest]. Updating." TDARR_NODE_ID="$latest" else - echo "NOTICE: nodeID is still the same [$TDARR_NODE_ID]." + log_message "NOTICE: nodeID is still the same [$TDARR_NODE_ID]." 
fi } @@ -123,15 +125,15 @@ check_single_tautulli_connection() { return 2 fi - echo "$(date '+%Y-%m-%d %H:%M:%S') - Checking Tautulli at: $url" + log_message "Checking Tautulli at: $url" local response response=$(curl -s "${url}?apikey=${api_key}&cmd=get_activity") if echo "$response" | jq . >/dev/null 2>&1; then - echo "$(date '+%Y-%m-%d %H:%M:%S') - Tautulli OK: $url" + log_message "Tautulli OK: $url" return 0 else - echo "$(date '+%Y-%m-%d %H:%M:%S') - WARNING: Could not connect or invalid JSON: $url" + log_message "WARNING: Could not connect or invalid JSON: $url" return 1 fi } @@ -143,7 +145,7 @@ check_tautulli_connections_on_startup() { # T1 must work check_single_tautulli_connection "$T1_TAUTULLI_API_KEY" "$T1_TAUTULLI_URL" if [ $? -ne 0 ]; then - echo "ERROR: T1 not reachable. Exiting." + log_message "ERROR: T1 not reachable. Exiting." exit 1 fi @@ -173,10 +175,11 @@ fetch_transcode_counts_from_tautulli() { fi local local_cnt remote_cnt + # Only count sessions that are transcoding video, not just audio local_cnt=$(echo "$resp" | jq '[.response.data.sessions[]? - | select(.transcode_decision == "transcode" and (.ip_address | startswith("10.0.0.")))] | length') + | select(.transcode_decision == "transcode" and .video_decision == "transcode" and (.ip_address | startswith("10.0.0.")))] | length') remote_cnt=$(echo "$resp" | jq '[.response.data.sessions[]? - | select(.transcode_decision == "transcode" and (.ip_address | startswith("10.0.0.") | not))] | length') + | select(.transcode_decision == "transcode" and .video_decision == "transcode" and (.ip_address | startswith("10.0.0.") | not))] | length') echo "$local_cnt $remote_cnt" } @@ -190,7 +193,7 @@ total_count=0 # Function: is_plex_transcoding_over_threshold # ------------------------------------------------------------ is_plex_transcoding_over_threshold() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - Checking Plex transcodes..." + log_message "Checking Plex transcodes..." 
local total_local=0 local total_remote=0 @@ -217,7 +220,7 @@ is_plex_transcoding_over_threshold() { total_count=$(( total_local + total_remote )) - echo "$(date '+%Y-%m-%d %H:%M:%S') - Found $total_local local & $total_remote remote => total=$total_count, threshold=$TRANSCODE_THRESHOLD" + log_message "Found $total_local local & $total_remote remote => total=$total_count, threshold=$TRANSCODE_THRESHOLD" # Return 0 if watchers >= threshold if [ "$total_count" -ge "$TRANSCODE_THRESHOLD" ]; then @@ -250,12 +253,20 @@ adjust_tdarr_workers() { local watchers="$1" # 1) Calculate how many watchers are above the offset - # If watchers < OFFSET_THRESHOLD => watchersOverOffset=0 - # If watchers=3 => watchersOverOffset=1 => reduce by 1 - # If watchers=4 => watchersOverOffset=2 => reduce by 2, etc. - local watchersOverOffset=$(( watchers - OFFSET_THRESHOLD + 1 )) - if [ "$watchersOverOffset" -lt 0 ]; then - watchersOverOffset=0 + # If OFFSET_THRESHOLD=0, use watchers directly + # If OFFSET_THRESHOLD>0 and watchers < OFFSET_THRESHOLD => watchersOverOffset=0 + # If OFFSET_THRESHOLD=3 and watchers=3 => watchersOverOffset=1 => reduce by 1 + # If OFFSET_THRESHOLD=3 and watchers=4 => watchersOverOffset=2 => reduce by 2, etc.
+ local watchersOverOffset + if [ "$OFFSET_THRESHOLD" -eq 0 ]; then + # When offset is 0, reduce by exactly the watcher count + watchersOverOffset=$watchers + else + # When offset >0, start reducing only after reaching threshold + watchersOverOffset=$(( watchers - OFFSET_THRESHOLD + 1 )) + if [ "$watchersOverOffset" -lt 0 ]; then + watchersOverOffset=0 + fi fi # 2) Desired = TDARR_DEFAULT_LIMIT - watchersOverOffset @@ -264,7 +275,7 @@ adjust_tdarr_workers() { desired=0 fi - echo "$(date '+%Y-%m-%d %H:%M:%S') - watchers=$watchers => watchersOverOffset=$watchersOverOffset => desiredWorkers=$desired" + log_message "watchers=$watchers => watchersOverOffset=$watchersOverOffset => desiredWorkers=$desired" # poll-worker-limits local poll_resp @@ -275,27 +286,27 @@ adjust_tdarr_workers() { local current current=$(echo "$poll_resp" | jq '.workerLimits.transcodegpu' 2>/dev/null) if [ -z "$current" ] || [ "$current" = "null" ]; then - echo "ERROR: Could not retrieve current GPU worker limit for nodeID='$TDARR_NODE_ID'. Will re-check log for a new ID." + log_message "ERROR: Could not retrieve current GPU worker limit for nodeID='$TDARR_NODE_ID'. Will re-check log for a new ID." refresh_node_id_if_changed return fi - echo "$(date '+%Y-%m-%d %H:%M:%S') - Current GPU worker limit: $current" + log_message "Current GPU worker limit: $current" local diff=$(( desired - current )) if [ "$diff" -eq 0 ]; then - echo "Already at the desired GPU worker limit ($desired)." + log_message "Already at the desired GPU worker limit ($desired)." return fi local step if [ "$diff" -gt 0 ]; then step="increase" - echo "Need to increase by $diff" + log_message "Need to increase by $diff" else step="decrease" diff=$(( -diff )) - echo "Need to decrease by $diff" + log_message "Need to decrease by $diff" fi local i=0 @@ -308,7 +319,7 @@ adjust_tdarr_workers() { sleep 1 done - echo "$(date '+%Y-%m-%d %H:%M:%S') - GPU worker limit adjustment complete." + log_message "GPU worker limit adjustment complete." 
} # ------------------------------------------------------------ @@ -317,34 +328,171 @@ adjust_tdarr_workers() { ensure_node_id_loaded check_tautulli_connections_on_startup +# Main loop with protection against duplicate operations +last_operation="" +last_gpu_limit=0 +consecutive_duplicates=0 + +# Set initial GPU workers on startup +if [ "$TDARR_ALTER_WORKERS" = "true" ]; then + log_message "Setting initial GPU workers to default limit: $TDARR_DEFAULT_LIMIT on startup" + + # Ensure we have nodeID before trying to set workers + ensure_node_id_loaded || { + log_message "ERROR: Could not get nodeID, can't set initial GPU workers" + sleep 5 # Wait a bit and continue, will try again in the main loop + } + + if [ -n "$TDARR_NODE_ID" ]; then + # Get current limit + current_limit=$(curl -s -X POST "${TDARR_API_URL}/api/v2/poll-worker-limits" \ + -H "Content-Type: application/json" \ + -d '{"data":{"nodeID":"'"$TDARR_NODE_ID"'"}}' | \ + jq '.workerLimits.transcodegpu' 2>/dev/null) + + if [ -n "$current_limit" ] && [ "$current_limit" != "null" ]; then + # Calculate how many workers to add/remove + diff=$(( TDARR_DEFAULT_LIMIT - current_limit )) + + if [ "$diff" -ne 0 ]; then + step="" + count=0 + + if [ "$diff" -gt 0 ]; then + step="increase" + count=$diff + log_message "Need to increase by $diff to reach default limit" + else + step="decrease" + count=$(( -diff )) + log_message "Need to decrease by $(( -diff )) to reach default limit" + fi + + i=0 + while [ $i -lt $count ]; do + curl -s -X POST "${TDARR_API_URL}/api/v2/alter-worker-limit" \ + -H "Content-Type: application/json" \ + -d '{"data":{"nodeID":"'"$TDARR_NODE_ID"'","process":"'"$step"'","workerType":"transcodegpu"}}' \ + >/dev/null 2>&1 + i=$(( i + 1 )) + sleep 1 + done + + log_message "Initial GPU worker limit set to $TDARR_DEFAULT_LIMIT" + else + log_message "GPU workers already at desired default limit: $current_limit" + fi + else + log_message "ERROR: Could not get current GPU worker limit" + fi + fi +fi + 
while true; do if is_plex_transcoding_over_threshold; then # watchers >= threshold if [ "$TDARR_ALTER_WORKERS" = "true" ]; then - echo "Threshold exceeded. Reducing GPU workers." + # Check if we're doing the same operation repeatedly + operation="reduce_workers_$total_count" + + # Get current limit to check if it changed + current_limit=$(curl -s -X POST "${TDARR_API_URL}/api/v2/poll-worker-limits" \ + -H "Content-Type: application/json" \ + -d '{"data":{"nodeID":"'"$TDARR_NODE_ID"'"}}' | \ + jq '.workerLimits.transcodegpu' 2>/dev/null) + + if [ "$operation" = "$last_operation" ] && [ "$current_limit" = "$last_gpu_limit" ]; then + consecutive_duplicates=$((consecutive_duplicates + 1)) + if [ $consecutive_duplicates -gt 2 ]; then + log_message "Skipping duplicate worker adjustment (done $consecutive_duplicates times already)" + sleep "$WAIT_SECONDS" + continue + fi + else + consecutive_duplicates=0 + fi + + last_operation="$operation" + last_gpu_limit="$current_limit" + + log_message "Threshold exceeded. Reducing GPU workers." adjust_tdarr_workers "$total_count" sleep "$WAIT_SECONDS" else # kill container + operation="kill_container" + + if [ "$operation" = "$last_operation" ]; then + consecutive_duplicates=$((consecutive_duplicates + 1)) + if [ $consecutive_duplicates -gt 2 ]; then + log_message "Skipping duplicate container management (done $consecutive_duplicates times already)" + sleep "$WAIT_SECONDS" + continue + fi + else + consecutive_duplicates=0 + fi + + last_operation="$operation" + if is_container_running; then - echo "Threshold exceeded: Killing $CONTAINER_NAME" + log_message "Threshold exceeded: Killing $CONTAINER_NAME" docker kill "$CONTAINER_NAME" else - echo "$CONTAINER_NAME is already stopped." + log_message "$CONTAINER_NAME is already stopped." 
fi sleep "$WAIT_SECONDS" fi else # watchers < threshold if [ "$TDARR_ALTER_WORKERS" = "true" ]; then + # Check if we're doing the same operation repeatedly + operation="adjust_workers_$total_count" + + # Get current limit to check if it changed + current_limit=$(curl -s -X POST "${TDARR_API_URL}/api/v2/poll-worker-limits" \ + -H "Content-Type: application/json" \ + -d '{"data":{"nodeID":"'"$TDARR_NODE_ID"'"}}' | \ + jq '.workerLimits.transcodegpu' 2>/dev/null) + + if [ "$operation" = "$last_operation" ] && [ "$current_limit" = "$last_gpu_limit" ]; then + consecutive_duplicates=$((consecutive_duplicates + 1)) + if [ $consecutive_duplicates -gt 2 ]; then + log_message "Skipping duplicate worker adjustment (done $consecutive_duplicates times already)" + sleep "$BASIC_CHECK" + continue + fi + else + consecutive_duplicates=0 + fi + + last_operation="$operation" + last_gpu_limit="$current_limit" + adjust_tdarr_workers "$total_count" fi + # Start container if needed + operation="start_container" + + if [ "$operation" = "$last_operation" ] && is_container_running; then + consecutive_duplicates=$((consecutive_duplicates + 1)) + if [ $consecutive_duplicates -gt 2 ]; then + log_message "Skipping duplicate container check (done $consecutive_duplicates times already)" + sleep "$BASIC_CHECK" + continue + fi + else + consecutive_duplicates=0 + fi + + last_operation="$operation" + if ! is_container_running; then - echo "Below threshold -> Starting container $CONTAINER_NAME." + log_message "Below threshold -> Starting container $CONTAINER_NAME." docker start "$CONTAINER_NAME" else - echo "Container $CONTAINER_NAME is already running." + log_message "Container $CONTAINER_NAME is already running." fi sleep "$BASIC_CHECK"