"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm

# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = pytest.mark.usefixtures("pyarrow_skip")


def _construct_dataframe(num_rows):
    """
    Construct a DataFrame for testing.

    Parameters
    ----------
    num_rows : int
        The number of rows for our DataFrame.

    Returns
    -------
    df : DataFrame
    """
    df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
    df["foo"] = "foo"
    df["bar"] = "bar"
    df["baz"] = "baz"
    df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
    df["int"] = np.arange(num_rows, dtype="int64")
    return df


@pytest.mark.slow
def test_multi_thread_string_io_read_csv(all_parsers):
    # see gh-11786
    parser = all_parsers
    max_row_range = 10000
    num_files = 100
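
    # Each in-memory "file" below holds the same CSV payload, so every
    # worker thread should come back with an identical frame.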
    bytes_to_df = [
        "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
        for _ in range(num_files)
    ]

    # Read all files in many threads.
    with ExitStack() as stack:
        files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]

        pool = stack.enter_context(ThreadPool(8))

        results = pool.map(parser.read_csv, files)
        first_result = results[0]

        for result in results:
            tm.assert_frame_equal(first_result, result)


def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
    """
    Generate a DataFrame using multiple threads.

    Parameters
    ----------
    parser : BaseParser
        The parser object to use for reading the data.
    path : str
        The location of the CSV file to read.
    num_rows : int
        The number of rows to read per task.
    num_tasks : int
        The number of tasks to use for reading this DataFrame.

    Returns
    -------
    df : DataFrame
    """

    def reader(arg):
        """
        Create a reader for part of the CSV.

        Parameters
        ----------
        arg : tuple
            A tuple of the following:

            * start : int
                The row at which to start parsing the CSV.
            * nrows : int
                The number of rows to read.

        Returns
        -------
        df : DataFrame
        """
        start, nrows = arg
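
        # Only the first chunk (start == 0) reads the header row; later chunks
        # skip the header plus the rows earlier chunks already cover, so the
        # date column has to be referenced by position (column 9), not name.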
        if not start:
            return parser.read_csv(
                path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
            )

        return parser.read_csv(
            path,
            index_col=0,
            header=None,
            skiprows=int(start) + 1,
            nrows=nrows,
            parse_dates=[9],
        )
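
    # Each task is a (start_row, rows_per_chunk) pair; together they split the
    # num_rows data rows across num_tasks threads.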
    tasks = [
        (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
    ]

    with ThreadPool(processes=num_tasks) as pool:
        results = pool.map(reader, tasks)
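
    # Chunks read with header=None carry default integer column labels, so
    # realign them to the first chunk's named columns before concatenating.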
    header = results[0].columns

    for r in results[1:]:
        r.columns = header

    final_dataframe = pd.concat(results)
    return final_dataframe


@pytest.mark.slow
def test_multi_thread_path_multipart_read_csv(all_parsers):
    # see gh-11786
    num_tasks = 4
    num_rows = 100000

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
    df = _construct_dataframe(num_rows)
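
    # Write the frame to disk, rebuild it with the threaded chunked reader,
    # and check that the reassembled result matches the original.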
    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)

        final_dataframe = _generate_multi_thread_dataframe(
            parser, path, num_rows, num_tasks
        )
        tm.assert_frame_equal(df, final_dataframe)