Source code for apcloudy.models

"""
This module defines the data models used to represent scraping jobs,
spiders, and projects, along with an enumeration of job execution states.

The Job, Spider, and Project classes each provide a ``from_dict`` class
method that builds instances from dictionaries, typically corresponding to
API responses, and prints the result to stdout as a table. The module also
includes properties and static methods to parse and format relevant data.

Classes:
  - JobState: Enum for job execution states.
  - Job: Represents a scraping job, its attributes, and utility methods.
  - Spider: Represents a spider and its configurations.
  - Project: Represents a project and associated metadata.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from datetime import datetime
from enum import Enum

from tabulate import tabulate

from .config import config


class JobState(Enum):
    """
    Enumeration of the states a job can be in during its lifecycle.

    Every member's value is the lowercase status string, so a raw status
    can be converted directly with ``JobState("running")``. A string that
    matches no member raises ``ValueError``.
    """

    # Member values are plain status strings; lookup by value is exact.
    SCHEDULED = "scheduled"
    RUNNING = "running"
    COMPLETED = "completed"
    DELETED = "deleted"
@dataclass
class Job:
    """
    Represents a job execution and maintains information related to its
    lifecycle, metrics, and associated resources.

    :ivar job_id: Unique identifier for the job.
    :type job_id: str
    :ivar spider_name: Name of the spider used to execute the job.
    :type spider_name: str
    :ivar state: Current state of the job.
    :type state: JobState
    :ivar project_id: Identifier of the project the job belongs to.
    :type project_id: str
    :ivar created_at: Timestamp when the job was created, or None if not
        available.
    :type created_at: Optional[datetime]
    :ivar started_at: Timestamp when the job was started, or None if not
        available.
    :type started_at: Optional[datetime]
    :ivar finished_at: Timestamp when the job was finished, or None if not
        available.
    :type finished_at: Optional[datetime]
    :ivar items_scraped: Total number of items scraped by the job.
    :type items_scraped: int
    :ivar requests_made: Total number of requests made during the job.
    :type requests_made: int
    :ivar job_args: Additional arguments or configuration parameters passed
        to the job.
    :type job_args: Dict[str, Any]
    :ivar units: Number of resource units used by the job.
    :type units: int
    :ivar logs_url: URL of the logs associated with the job, or None if not
        set.
    :type logs_url: Optional[str]
    :ivar items_url: URL of the scraped items for the job, or None if not
        set.
    :type items_url: Optional[str]
    """

    job_id: str
    spider_name: str
    state: JobState
    project_id: str = ""
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    finished_at: Optional[datetime] = None
    items_scraped: int = 0
    requests_made: int = 0
    job_args: Dict[str, Any] = field(default_factory=dict)
    units: int = 1
    logs_url: Optional[str] = None
    items_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: List[Dict[str, Any]]) -> List['Job']:
        """
        Build Job instances from a list of job dictionaries and print them
        as a table.

        Each dictionary must contain ``job_id``, ``spider_name`` and
        ``status``; every other key is optional and falls back to the
        corresponding field default. The resulting jobs are printed to
        stdout as a grid table. As a side effect ``config.current_job_id``
        is assigned each job's id in turn, so after the call it holds the
        id of the last job in ``data`` (it is left untouched when ``data``
        is empty).

        :param data: Job dictionaries, typically taken from an API response.
        :type data: List[Dict[str, Any]]
        :return: One Job per input dictionary, in input order.
        :rtype: List[Job]
        :raises KeyError: If a dictionary lacks ``job_id``, ``spider_name``
            or ``status``.
        :raises ValueError: If ``status`` is not a valid ``JobState`` value.
        """
        jobs = []
        row_data = []

        for d in data:
            job = cls(
                job_id=d['job_id'],
                spider_name=d['spider_name'],
                state=JobState(d['status']),
                project_id=d.get('project_id', ''),
                created_at=cls._parse_datetime(d.get('created_at')),
                started_at=cls._parse_datetime(d.get('started_at')),
                finished_at=cls._parse_datetime(d.get('finished_at')),
                items_scraped=d.get('items_scraped', 0),
                requests_made=d.get('requests_made', 0),
                job_args=d.get('job_args', {}),
                units=d.get('units', 1),
                logs_url=d.get('logs_url'),
                items_url=d.get('items_url'),
            )
            jobs.append(job)
            config.current_job_id = d['job_id']

            duration = job.duration
            row_data.append([
                job.job_id,
                job.spider_name,
                job.state.value,
                job.project_id or "N/A",
                cls._format_timestamp(job.created_at),
                cls._format_timestamp(job.started_at),
                cls._format_timestamp(job.finished_at),
                job.items_scraped,
                job.requests_made,
                job.units,
                # Compare against None explicitly: a 0.0 second duration is
                # a real value and must not be reported as "N/A".
                f"{duration:.2f}s" if duration is not None else "N/A",
            ])

        # Display job data in table format with column headers
        headers = ["Job ID", "Spider Name", "State", "Project ID", "Created At",
                   "Started At", "Finished At", "Items", "Requests", "Units",
                   "Duration"]
        print(tabulate(row_data, headers=headers, tablefmt="grid"))

        return jobs

    @staticmethod
    def _format_timestamp(value: Optional[datetime]) -> str:
        """Return ``value`` as ``YYYY-MM-DD HH:MM:SS``, or "N/A" if unset."""
        return value.strftime("%Y-%m-%d %H:%M:%S") if value else "N/A"

    @staticmethod
    def _parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """
        Parse an ISO 8601 formatted date-time string into a ``datetime``.

        Every ``'Z'`` in the string is replaced with ``'+00:00'`` before it
        is handed to ``datetime.fromisoformat``, so a UTC ``Z`` suffix
        yields a timezone-aware result. Strings without an offset yield a
        naive ``datetime``.

        :param dt_str: The date-time string to parse. ``None`` or an empty
            string returns ``None``.
        :type dt_str: Optional[str]
        :return: The parsed ``datetime``, or ``None`` if the input is empty
            or cannot be parsed.
        :rtype: Optional[datetime]
        """
        if not dt_str:
            return None
        try:
            return datetime.fromisoformat(dt_str.replace('Z', '+00:00'))
        except (ValueError, AttributeError):
            # ValueError: not a parseable ISO string.
            # AttributeError: a truthy non-string value without .replace().
            return None

    @property
    def duration(self) -> Optional[float]:
        """
        Job duration in seconds.

        :return: Seconds between ``started_at`` and ``finished_at``, or
            ``None`` when either timestamp is missing.
        :rtype: Optional[float]
        """
        if self.started_at and self.finished_at:
            return (self.finished_at - self.started_at).total_seconds()
        return None
@dataclass
class Spider:
    """
    Represents a spider.

    :ivar name: Name of the spider.
    :ivar description: Description of the spider; empty by default.
    :ivar project_id: Identifier of the owning project; empty by default.
    :ivar settings: Settings dictionary; empty by default.
    """

    name: str
    description: str = ""
    project_id: str = ""
    settings: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: List[Dict[str, Any]]) -> List['Spider']:
        """
        Build Spider instances from a list of dictionaries and print them
        as a numbered table.

        Each dictionary must contain ``name``; ``description`` and
        ``project_id`` are optional. The table shows the raw dictionary
        values, and is printed only after every entry has been processed.

        :param data: Spider dictionaries, typically from an API response.
        :return: One Spider per input dictionary, in input order.
        :raises KeyError: If a dictionary has no ``name`` key.
        """
        column_titles = ["S.No", "Name", "Description", "Project ID"]
        table_rows = []
        spiders = []

        for serial, entry in enumerate(data, 1):
            # The table row is taken from the raw values, before the
            # instance (which applies the '' defaults) is constructed.
            table_rows.append([
                serial,
                entry.get('name'),
                entry.get('description'),
                entry.get('project_id'),
            ])
            built = cls(
                name=entry['name'],
                description=entry.get('description', ''),
                project_id=entry.get('project_id', ''),
            )
            spiders.append(built)

        print("SPIDER DETAILS")
        rendered = tabulate(table_rows, headers=column_titles, tablefmt="grid")
        print(rendered)

        return spiders
@dataclass
class Project:
    """
    Represents a project.

    :ivar project_id: Identifier of the project.
    :ivar org_name: Name of the organization the project belongs to.
    :ivar name: Name of the project.
    :ivar description: Description of the project; empty by default.
    :ivar created_at: Creation timestamp, or None if not available.
    :ivar spider_count: Number of spiders; 0 by default.
    :ivar job_count: Number of jobs; 0 by default.
    """

    project_id: str
    org_name: str
    name: str
    description: str = ""
    created_at: Optional[datetime] = None
    spider_count: int = 0
    job_count: int = 0

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Project':
        """
        Build a Project from a dictionary and print it as a one-row table.

        ``project_id``, ``organization_name`` and ``name`` are required;
        the remaining keys are optional. ``created_at`` is parsed with
        ``Job._parse_datetime``.

        :param data: Project dictionary, typically from an API response.
        :return: The constructed Project.
        :raises KeyError: If a required key is missing.
        """
        instance = cls(
            project_id=data['project_id'],
            org_name=data['organization_name'],
            name=data['name'],
            description=data.get('description', ''),
            created_at=Job._parse_datetime(data.get('created_at')),
            spider_count=data.get('spider_count', 0),
            job_count=data.get('job_count', 0),
        )

        # Missing description / timestamp are shown as "N/A" in the table.
        if instance.created_at:
            created_text = instance.created_at.strftime("%Y-%m-%d %H:%M:%S")
        else:
            created_text = "N/A"

        table_row = [
            instance.project_id,
            instance.org_name,
            instance.name,
            instance.description or "N/A",
            created_text,
            instance.spider_count,
            instance.job_count,
        ]
        column_titles = ["ID", "Org Name", "Name", "Description", "Created At",
                         "Spider Count", "Job Count"]

        print("PROJECT DETAILS")
        print(tabulate([table_row], headers=column_titles, tablefmt="grid"))

        return instance