Textract Handler API¶

auris_tools.textractHandler ¶

TextractHandler ¶

Handler for Amazon Textract operations to extract text from documents.

This class provides methods to interact with the AWS Textract service to extract text from documents stored in S3.

Source code in auris_tools/textractHandler.py

class TextractHandler:
    """
    Wrapper around an Amazon Textract client for asynchronous text detection.

    Offers helpers to start a text detection job for a document stored in S3,
    query the job status, gather every page of results, and flatten those
    results into a single string.
    """

    def __init__(self, config=None):
        """
        Create the Textract client used by every other method.

        Args:
            config: An AWSConfiguration object. When None, a default
                AWSConfiguration() is built (presumably configured from
                environment variables -- that logic lives in AWSConfiguration).
        """
        cfg = AWSConfiguration() if config is None else config

        # Session first, then the service client derived from it.
        session_kwargs = cfg.get_boto3_session_args()
        aws_session = boto3.session.Session(**session_kwargs)
        self.client = aws_session.client('textract', **cfg.get_client_args())

        logging.info(f'Initialized Textract client in region {cfg.region}')

    def start_job(self, s3_bucket_name, object_name):
        """
        Kick off an asynchronous text detection job for an S3 document.

        Args:
            s3_bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket

        Returns:
            str: The 'JobId' value from the Textract response

        Raises:
            Exception: Any error raised while starting the job is logged and
                re-raised unchanged
        """
        document_location = {
            'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name}
        }
        try:
            started = self.client.start_document_text_detection(
                DocumentLocation=document_location
            )
            new_job_id = started['JobId']
            logging.info(
                f'Started Textract job {new_job_id} for {s3_bucket_name}/{object_name}'
            )
            return new_job_id
        except Exception as exc:
            logging.error(
                f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(exc)}'
            )
            raise

    def get_job_status(self, job_id):
        """
        Look up the current status of a Textract job.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The 'JobStatus' field of the response
                (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

        Raises:
            Exception: Any error raised by the lookup is logged and re-raised
        """
        try:
            reply = self.client.get_document_text_detection(JobId=job_id)
            job_status = reply['JobStatus']
            logging.info(f'Textract job {job_id} status: {job_status}')
            return job_status
        except Exception as exc:
            logging.error(
                f'Error getting status for Textract job {job_id}: {str(exc)}'
            )
            raise

    def is_job_complete(self, job_id):
        """
        Wait one second, then report the status of a Textract job.

        Note that the raw status string is returned, not a boolean.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status, as returned by get_job_status
        """
        # Pause before each status request to avoid rate limiting.
        time.sleep(1)
        current_status = self.get_job_status(job_id)
        return current_status

    def get_job_results(self, job_id):
        """
        Collect every page of results for a Textract job.

        Pages are requested one after another for as long as the previous
        response carries a non-empty 'NextToken'.

        Args:
            job_id: ID of the Textract job

        Returns:
            list: The raw response of each request, in the order received

        Raises:
            Exception: Any error raised while fetching is logged and re-raised
        """
        collected = []

        try:
            request_kwargs = {'JobId': job_id}
            page_num = 1
            while True:
                if page_num > 1:
                    # Only follow-up requests are throttled.
                    time.sleep(1)  # Avoid rate limiting
                page = self.client.get_document_text_detection(
                    **request_kwargs
                )
                collected.append(page)
                logging.info(
                    f'Received page {page_num} of results for job {job_id}'
                )

                continuation = page.get('NextToken')
                if not continuation:
                    break
                request_kwargs['NextToken'] = continuation
                page_num += 1

            return collected
        except Exception as exc:
            logging.error(
                f'Error getting results for Textract job {job_id}: {str(exc)}'
            )
            raise

    def get_full_text(self, response):
        """
        Join the text of every LINE block found in the response pages.

        Args:
            response: List of response pages from Textract

        Returns:
            str: The LINE texts separated by single spaces, or an empty
                string if extraction fails (the failure is logged)
        """
        try:
            line_texts = [
                block.get('Text', '')
                for page in response
                for block in page.get('Blocks', [])
                if block.get('BlockType') == 'LINE'
            ]
            return ' '.join(line_texts)
        except Exception as exc:
            logging.error(
                f'Error extracting full text from Textract response: {str(exc)}'
            )
            return ''

__init__ ¶

__init__(config = None)

Initialize the Textract handler with AWS configuration.

Parameters:

Name	Type	Description	Default
`config`		An AWSConfiguration object, or None to use environment variables	`None`

Source code in auris_tools/textractHandler.py

def __init__(self, config=None):
    """
    Create the Textract client used by the handler.

    Args:
        config: An AWSConfiguration object. When None, a default
            AWSConfiguration() is built (presumably configured from
            environment variables -- that logic lives in AWSConfiguration).
    """
    cfg = AWSConfiguration() if config is None else config

    # Build the session, then derive the Textract client from it.
    session_kwargs = cfg.get_boto3_session_args()
    aws_session = boto3.session.Session(**session_kwargs)
    self.client = aws_session.client('textract', **cfg.get_client_args())

    logging.info(f'Initialized Textract client in region {cfg.region}')

get_full_text ¶

get_full_text(response)

Extract the full text from Textract response pages.

Parameters:

Name	Type	Description	Default
`response`		List of response pages from Textract	required

Returns:

Name	Type	Description
`str`		The full extracted text as a string

Source code in auris_tools/textractHandler.py

def get_full_text(self, response):
    """
    Join the text of every LINE block found in the response pages.

    Args:
        response: List of response pages from Textract

    Returns:
        str: The LINE texts separated by single spaces, or an empty string
            if extraction fails (the failure is logged)
    """
    try:
        collected = []
        for page in response:
            # Blocks of any other type (and pages without blocks) are skipped.
            collected.extend(
                block.get('Text', '')
                for block in page.get('Blocks', [])
                if block.get('BlockType') == 'LINE'
            )
        return ' '.join(collected)
    except Exception as exc:
        logging.error(
            f'Error extracting full text from Textract response: {str(exc)}'
        )
        return ''

get_job_results ¶

get_job_results(job_id)

Get the results of a completed Textract job.

This method handles pagination of results automatically.

Parameters:

Name	Type	Description	Default
`job_id`		ID of the Textract job	required

Returns:

Name	Type	Description
`list`		List of response pages from Textract

Source code in auris_tools/textractHandler.py

def get_job_results(self, job_id):
    """
    Collect every page of results for a Textract job.

    Pages are requested one after another for as long as the previous
    response carries a non-empty 'NextToken'.

    Args:
        job_id: ID of the Textract job

    Returns:
        list: The raw response of each request, in the order received

    Raises:
        Exception: Any error raised while fetching is logged and re-raised
    """
    results = []

    try:
        token = None
        page_num = 0
        while True:
            page_num += 1
            if token:
                # Follow-up requests are throttled; the first one is not.
                time.sleep(1)  # Avoid rate limiting
                reply = self.client.get_document_text_detection(
                    JobId=job_id, NextToken=token
                )
            else:
                reply = self.client.get_document_text_detection(JobId=job_id)
            results.append(reply)
            logging.info(
                f'Received page {page_num} of results for job {job_id}'
            )

            token = reply.get('NextToken')
            if not token:
                return results
    except Exception as exc:
        logging.error(
            f'Error getting results for Textract job {job_id}: {str(exc)}'
        )
        raise

get_job_status ¶

get_job_status(job_id)

Get the status of a Textract job.

Parameters:

Name	Type	Description	Default
`job_id`		ID of the Textract job	required

Returns:

Name	Type	Description
`str`		The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

Source code in auris_tools/textractHandler.py

def get_job_status(self, job_id):
    """
    Look up the current status of a Textract job.

    Args:
        job_id: ID of the Textract job

    Returns:
        str: The 'JobStatus' field of the response
            (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

    Raises:
        Exception: Any error raised by the lookup is logged and re-raised
    """
    try:
        reply = self.client.get_document_text_detection(JobId=job_id)
        job_status = reply['JobStatus']
        logging.info(f'Textract job {job_id} status: {job_status}')
        return job_status
    except Exception as exc:
        logging.error(
            f'Error getting status for Textract job {job_id}: {str(exc)}'
        )
        raise

is_job_complete ¶

is_job_complete(job_id)

Check if a Textract job has completed.

Parameters:

Name	Type	Description	Default
`job_id`		ID of the Textract job	required

Returns:

Name	Type	Description
`str`		The job status

Source code in auris_tools/textractHandler.py

def is_job_complete(self, job_id):
    """
    Wait one second, then report the status of a Textract job.

    Note that the raw status string is returned, not a boolean.

    Args:
        job_id: ID of the Textract job

    Returns:
        str: The job status, as returned by get_job_status
    """
    # Pause before each status request to avoid rate limiting.
    time.sleep(1)
    current_status = self.get_job_status(job_id)
    return current_status

start_job ¶

start_job(s3_bucket_name, object_name)

Start an asynchronous text detection job for a document in S3.

Parameters:

Name	Type	Description	Default
`s3_bucket_name`		Name of the S3 bucket containing the document	required
`object_name`		Object key of the document in the S3 bucket	required

Returns:

Name	Type	Description
`str`		The JobId of the started Textract job

Raises:

Type	Description
`Exception`	If there is an error starting the job

Source code in auris_tools/textractHandler.py

def start_job(self, s3_bucket_name, object_name):
    """
    Kick off an asynchronous text detection job for an S3 document.

    Args:
        s3_bucket_name: Name of the S3 bucket containing the document
        object_name: Object key of the document in the S3 bucket

    Returns:
        str: The 'JobId' value from the Textract response

    Raises:
        Exception: Any error raised while starting the job is logged and
            re-raised unchanged
    """
    document_location = {
        'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name}
    }
    try:
        started = self.client.start_document_text_detection(
            DocumentLocation=document_location
        )
        new_job_id = started['JobId']
        logging.info(
            f'Started Textract job {new_job_id} for {s3_bucket_name}/{object_name}'
        )
        return new_job_id
    except Exception as exc:
        logging.error(
            f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(exc)}'
        )
        raise

Textract Handler API¶

auris_tools.textractHandler ¶

TextractHandler ¶

__init__ ¶

get_full_text ¶

get_job_results ¶

get_job_status ¶

is_job_complete ¶

start_job ¶

__init__ ¶