Skip to content

Commit

Permalink
Benchmarking script draft
Browse files Browse the repository at this point in the history
  • Loading branch information
AndriiPovsten committed Jul 10, 2024
1 parent fc70577 commit c699745
Showing 1 changed file with 145 additions and 0 deletions.
145 changes: 145 additions & 0 deletions scripts/logs_lifetime_benchmarking_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, SecondLocator
import click

"""Run this script to generate the plots of current job status"""

def parse_log_file(file_path):
    """Read the log file at *file_path* and return all its lines as a list."""
    with open(file_path, 'r') as log_file:
        return log_file.readlines()

def extract_unique_jobs(lines):
    """Deduplicate job log lines by job id, keeping the last occurrence.

    Only lines whose stripped text starts with ``reana-run-job-`` count;
    the first whitespace-separated token is used as the job id.
    """
    latest_by_id = {}
    for raw_line in lines:
        entry = raw_line.strip()
        if entry.startswith('reana-run-job-'):
            latest_by_id[entry.split()[0]] = entry
    return latest_by_id.values()

def extract_succeeded_timestamps(unique_jobs):
    """Parse the 6th whitespace-separated field of each job line as a timestamp.

    Fields equal to ``<none>`` are dropped; any trailing ``,...`` suffix is
    stripped before parsing. Unparseable values become ``NaT``.
    """
    raw_fields = (entry.split()[5] for entry in unique_jobs)
    cleaned = [field.split(',')[0] for field in raw_fields if field != "<none>"]
    return pd.to_datetime(cleaned, errors='coerce')

def get_sorted_data(lines):
    """Return a copy of *lines* sorted by the second whitespace-separated field.

    The original index is kept in the sort key so the ordering stays stable,
    matching a plain stable sort on the second field.
    """
    decorated = [(entry.split()[1], position, entry)
                 for position, entry in enumerate(lines)]
    decorated.sort()
    return [entry for _, _, entry in decorated]

def filter_jobs(sorted_data, status):
    """Keep only the lines whose second whitespace-separated field equals *status*."""
    matching = []
    for entry in sorted_data:
        if entry.split()[1] == status:
            matching.append(entry)
    return matching

def extract_running_timestamps(running_jobs):
    """Build +1/-1 start/finish events for running jobs, sorted by time.

    Fields used per line: [0] job id, [3] start time, [5] finish time
    (``<none>`` or a timestamp, possibly with a ``,...`` suffix).
    Each job id contributes at most once; a job whose finish field is
    ``<none>`` is still marked as seen but produces no events.
    """
    events = []
    seen_ids = set()
    for entry in running_jobs:
        fields = entry.split()
        identifier = fields[0]
        if identifier not in seen_ids:
            started = pd.to_datetime(fields[3])
            ended_raw = fields[5].split(',')[0]
            if ended_raw != '<none>':
                events.append((started, 1))
                events.append((pd.to_datetime(ended_raw), -1))
            seen_ids.add(identifier)
    return sorted(events)

def extract_pending_timestamps(pending_jobs):
    """Build +1/-1 enter/leave events for pending jobs, sorted by time.

    Fields used per line: [0] job id, [2] pending-start time, [3] finish time
    (``<none>`` or a timestamp, possibly with a ``,...`` suffix).
    A line whose start field is ``<none>`` is skipped without marking the job
    as seen (a later line for the same job may still be used); a job with a
    valid start but a ``<none>`` finish is marked seen and yields no events.
    """
    events = []
    seen_ids = set()
    for entry in pending_jobs:
        fields = entry.split()
        identifier = fields[0]
        if identifier in seen_ids or fields[2] == '<none>':
            continue
        entered = pd.to_datetime(fields[2])
        left_raw = fields[3].split(',')[0]
        if left_raw != '<none>':
            events.append((entered, 1))
            events.append((pd.to_datetime(left_raw), -1))
        seen_ids.add(identifier)
    return sorted(events)

def calculate_cumulative(timestamps):
    """Split (time, delta) events into parallel lists of times and running totals.

    Returns a pair ``(times, totals)`` where ``totals[i]`` is the sum of all
    deltas up to and including event ``i``.
    """
    times = [moment for moment, _ in timestamps]
    totals = []
    running_total = 0
    for _, delta in timestamps:
        running_total += delta
        totals.append(running_total)
    return times, totals

def plot_data(succeeded_counts, x_running, y_running, x_pending, y_pending,
              title='Analysis Results', figsize=(12, 8)):
    """Plot cumulative counts of finished, running and pending jobs over time.

    Args:
        succeeded_counts: pandas Series of completion counts indexed by timestamp.
        x_running, y_running: times and running-job totals from calculate_cumulative().
        x_pending, y_pending: times and pending-job totals from calculate_cumulative().
        title: plot title shown above the axes.
        figsize: figure size as a (width, height) tuple in inches.
    """
    # BUG FIX: `figsize` and `title` were undefined globals here (NameError at
    # runtime); they are now keyword parameters whose defaults match the CLI
    # option defaults, keeping existing positional callers working.
    plt.figure(figsize=figsize)

    # Finished jobs accumulate monotonically, so plot the cumulative sum.
    plt.plot(succeeded_counts.index, succeeded_counts.cumsum(),
             label='Finished', linestyle='-', color='green', alpha=0.5)

    # Running jobs: +1 on start, -1 on finish.
    plt.plot(x_running, y_running, linestyle='-', color='blue',
             alpha=0.5, linewidth=3, label='Running')

    # Pending jobs: +1 on entering pending, -1 on leaving it.
    plt.plot(x_pending, y_pending, linestyle='-', color='orange',
             alpha=0.5, linewidth=3, label='Pending')

    plt.xlabel('Processing time')
    plt.ylabel('Number of Jobs')
    plt.title(title)
    # One tick every 40 seconds, rendered as HH:MM:SS.
    plt.gca().xaxis.set_major_formatter(DateFormatter("%H:%M:%S"))
    plt.gca().xaxis.set_major_locator(SecondLocator(interval=40))
    plt.grid(True)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

@click.command()
@click.argument('file_path')
@click.option('--title', default='Analysis Results', help='Title of the analysis results')
@click.option('--figsize', nargs=2, type=float, default=(12, 8), help='Figure size as two float values')
def main(file_path, title, figsize):
    """Run the benchmarking workflow lifetime analysis and show the plot.

    FILE_PATH is a text file with job logs previously saved via ``kubectl``.

    .. code-block:: console

        $ cd reana/scripts
        $ python logs_lifetime_benchmarking_test.py logs.txt --title "My run"
    """
    # BUG FIX: the function previously accepted only `file_path`, so click
    # raised TypeError when it passed the --title/--figsize option values.
    lines = parse_log_file(file_path)

    unique_jobs = extract_unique_jobs(lines)
    succeeded_timestamps = extract_succeeded_timestamps(unique_jobs)
    # Completions per timestamp, in chronological order.
    succeeded_counts = succeeded_timestamps.value_counts().sort_index()

    sorted_data = get_sorted_data(lines)

    running_jobs = filter_jobs(sorted_data, 'Running')
    timestamps_running = extract_running_timestamps(running_jobs)
    x_running, y_running = calculate_cumulative(timestamps_running)

    pending_jobs = filter_jobs(sorted_data, 'Pending')
    timestamps_pending = extract_pending_timestamps(pending_jobs)
    x_pending, y_pending = calculate_cumulative(timestamps_pending)

    # `figsize` arrives from click as a 2-tuple of floats; pass it through.
    plot_data(succeeded_counts, x_running, y_running, x_pending, y_pending,
              title=title, figsize=tuple(figsize))

if __name__ == "__main__":
    main()

0 comments on commit c699745

Please sign in to comment.