o
    hO]                     @   s4  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZmZmZmZmZmZ dZdZdZdZeeeeef f Zed	eeef f Zer`dd
lm Z  eefdedededefddZ!efdedededefddZ"dedededededefddZ#efdede$defddZ%efdede$defdd Z&	d8dedededefd!d"Z'd#ede	e fd$d%Z(d&e	e de	e	e  fd'd(Z)G d)d* d*e*Z+G d+d, d,e+Z,G d-d. d.e+Z-G d/d0 d0e*Z.g d1Z/g d2Z0G d3d4 d4e1Z2e2d Z3eG d5d	 d	Z4G d6d7 d7e*Z5dS )9    N)	dataclass)
itemgetter)	TYPE_CHECKINGAnyDictListOptionalSetTupleTypeUnion   )utils)T_bboxT_numT_obj
T_obj_iter
T_obj_listT_point   TableSettings)Pageedgesx_tolerancey_tolerancereturnc                 C   sR   g g d}| D ]}||d   | qt|d d|}t|d d|}|| S )zs
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    vhorientationr   x0r   top)appendr   Zsnap_objects)r   r   r   Zby_orientationeZ	snapped_vZ	snapped_h r$   C/var/www/html/venv/lib/python3.10/site-packages/pdfplumber/table.py
snap_edges   s   
	r&   r   	tolerancec           	      C   s   |dkr	d\}}n|dkrd\}}nt dtt| t|d}|d g}|dd	 D ])}|d
 }|| || | krO|| || krNt|||| |d
< q+|| q+|S )z
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    r   )r    x1r   )r!   bottomzOrientation must be 'v' or 'h'keyr   r   N)
ValueErrorlistsortedr   r   Zresize_objectr"   )	r   r   r'   Zmin_propZmax_propZsorted_edgesZjoinedr#   lastr$   r$   r%   join_edge_group'   s   


r1   snap_x_tolerancesnap_y_tolerancejoin_x_tolerancejoin_y_tolerancec           	         sv   dt dtttf fdd}|dks|dkrt| ||} t| |d}tj||d} fdd|D }ttj	| } | S )	z|
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    edger   c                 S   s$   | d dkrd| d fS d| d fS )Nr   r   r!   r   r    r$   )r6   r$   r$   r%   	get_groupP   s   zmerge_edges.<locals>.get_groupr   r*   c                 3   s4    | ]\}}t ||d  |d  dkr nV  qdS )r   r   N)r1   ).0kitemsr4   r5   r$   r%   	<genexpr>[   s    
zmerge_edges.<locals>.<genexpr>)
r   r
   strr   r&   r/   	itertoolsgroupbyr.   chain)	r   r2   r3   r4   r5   r7   _sortedZedge_groupsZedge_genr$   r;   r%   merge_edgesD   s   rB   wordsword_thresholdc           	   
      s   t | tdd}t fdd|}ttt j|}t|dkr"g S tttd|}t	ttd|}g }|D ]"}||||d |d || dd	|||d
 |d
 || dd	g7 }q8|S )zi
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    r!   r   c                       t |  kS NlenxrD   r$   r%   <lambda>m       z"words_to_edges_h.<locals>.<lambda>r   r    r(   r   )r    r(   r!   r)   widthr   r)   )
r   cluster_objectsr   filterr.   mapZobjects_to_rectrH   minmax)	rC   rD   Zby_toplarge_clustersZrectsZmin_x0max_x1r   rr$   rK   r%   words_to_edges_he   s4   rW   c                    sF  t | tdd}t | tdd}dtdtfdd}t | |d}|| | }t|dd	 d
}tfdd	|}ttt j	|}	g }
|	D ] t
 fdd|
D }|sY|
  qEt|
dkrbg S tt j|
}tt|tdd
}tttd|}tttd|tttd|fdd|D || ddg S )zy
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    r    r   r(   wordr   c                 S   s   t | d | d  d S )Nr    r(      )float)rX   r$   r$   r%   
get_center   s   z$words_to_edges_v.<locals>.get_centerc                 S   s
   t |  S rF   rG   rI   r$   r$   r%   rL      s   
 z"words_to_edges_v.<locals>.<lambda>r*   c                    rE   rF   rG   rI   rK   r$   r%   rL      rM   c                 3   s    | ]	}t  |V  qd S rF   )r   Zget_bbox_overlapr8   cbboxr$   r%   r<      s    z#words_to_edges_v.<locals>.<genexpr>r   r!   r)   c              	      s*   g | ]}|d  |d     ddqS )r    r   r    r(   r!   r)   heightr   r$   )r8   b)
max_bottommin_topr$   r%   
<listcomp>   s    	z$words_to_edges_v.<locals>.<listcomp>r   r`   )r   rO   r   r   r   r/   rP   r.   rQ   Zobjects_to_bboxanyr"   rH   Zbbox_to_rectrS   rR   )rC   rD   Zby_x0Zby_x1r[   Z	by_centerZclustersZsorted_clustersrT   ZbboxesZcondensed_bboxesZoverlapZcondensed_rectsZsorted_rectsrU   r$   )r_   rc   rd   rD   r%   words_to_edges_v   sB   
	rg   c           	         s   i } fdddD \}}t |tdddD ][}t |tdddD ]O}|d |d | krp|d |d | krp|d |d | krp|d |d | krp|d |d f}||vr^g g d||< || d	 | || d
 | q!q|S )zi
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    c                    s"   g | ] t t fd dqS )c                    s   | d  kS )Nr   r$   rI   or$   r%   rL      rM   z3edges_to_intersections.<locals>.<listcomp>.<lambda>)r.   rP   )r8   r   rh   r%   re      s    z*edges_to_intersections.<locals>.<listcomp>r   r    r!   r*   r)   r(   r   r   )r/   r   r"   )	r   r   r   intersectionsZv_edgesZh_edgesr   r   Zvertexr$   rj   r%   edges_to_intersections   s$   

rl   rk   c                    s   dt dt dtffdd tt tdtt  dtdtt	 f fdd	fd
dt
tD }ttd|S )a8  
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    p1p2r   c                    s   dt dtt fdd}| d |d kr*| |  d | | d }t|r*dS | d |d krI| |  d	 | | d	 }t|rIdS d
S )Nr   r   c                 S   s   t ttj| S rF   )setrQ   r   Zobj_to_bboxrj   r$   r$   r%   edges_to_set   s   zCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_setr   r   Tr   r   F)r   r	   r   intersectionrH   )rm   rn   rp   common)rk   r$   r%   edge_connects   s   z-intersections_to_cells.<locals>.edge_connectspointsic                    s   |d krd S | |  | |d d  } fdd|D } fdd|D }|D ];} |s0q(|D ]0} |s:q2|d |d f}|v rb||rb||rb d  d |d |d f    S q2q(d S )Nr   c                        g | ]}|d   d  kr|qS )r   r$   r8   rJ   ptr$   r%   re          zFintersections_to_cells.<locals>.find_smallest_cell.<locals>.<listcomp>c                    rv   r   r$   rw   rx   r$   r%   re     rz   r   r$   )rt   ru   restZbelowrightZbelow_ptZright_ptbottom_right)rs   rk   n_pointsrx   r%   find_smallest_cell
  s,   

$z2intersections_to_cells.<locals>.find_smallest_cellc                 3   s    | ]} |V  qd S rF   r$   )r8   ru   )r   rt   r$   r%   r<   %  s    z)intersections_to_cells.<locals>.<genexpr>N)r   boolr.   r/   keysrH   r   intr   r   rangerP   )rk   Zcell_genr$   )rs   r   rk   r   rt   r%   intersections_to_cells   s   
&r   cellsc                    s0  dt dtttttf fdd}t| }t  g }g }t|r|t|}t|D ]<}||}t|dkrC t|O  || || q&t fdd|D }|dkrb t|O  || || q&t||krx|t|  	  |	  t|st|r|t| t
|dd	 d
}	dd |	D }
|
S )z
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).
    r_   r   c                 S   s(   | \}}}}||f||f||f||ffS rF   r$   )r_   r    r!   r(   r)   r$   r$   r%   bbox_to_corners/  s   z(cells_to_tables.<locals>.bbox_to_cornersr   c                 3   s    | ]}| v V  qd S rF   r$   r\   Zcurrent_cornersr$   r%   r<   H  s    z"cells_to_tables.<locals>.<genexpr>c                 S   s   t dd | D S )Nc                 s   s     | ]}|d  |d fV  qdS )r   r   Nr$   r\   r$   r$   r%   r<   a  s    z4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>)rR   )tr$   r$   r%   rL   a  s    z!cells_to_tables.<locals>.<lambda>r*   c                 S   s   g | ]
}t |d kr|qS r{   rG   )r8   r   r$   r$   r%   re   b      z#cells_to_tables.<locals>.<listcomp>)r   r
   r   r.   ro   rH   r"   removesumclearr/   )r   r   Zremaining_cellsZcurrent_cellstablesZinitial_cell_countcellZcell_cornersZcorner_countrA   filteredr$   r   r%   cells_to_tables)  s:   


r   c                   @   s"   e Zd Zdeee  fddZdS )	CellGroupr   c              	   C   sh   || _ tttdtd |tttdtd |tttdtd |tttdtd |f| _d S Nr   r   rY   r   )r   rR   rQ   r   rP   rS   r_   )selfr   r$   r$   r%   __init__g  s   
zCellGroup.__init__N)__name__
__module____qualname__r   r   r   r   r$   r$   r$   r%   r   f  s    r   c                   @      e Zd ZdS )RowNr   r   r   r$   r$   r$   r%   r   q      r   c                   @   r   )ColumnNr   r$   r$   r$   r%   r   u  r   r   c                   @   s   e Zd Zdddee fddZedefddZd	ee	 dee	 fd
dZ
edee	 fddZedee	 fddZdedeeee   fddZdS )Tablepager   r   c                 C   s   || _ || _d S rF   )r   r   )r   r   r   r$   r$   r%   r   z  s   
zTable.__init__r   c                 C   sJ   | j }tttd|tttd|tttd|tttd|fS r   )r   rR   rQ   r   rS   )r   r]   r$   r$   r%   r_   ~  s   z
Table.bboxkindc           
         s   |t u rdnd t  }t| jt| d}ttttt | j}t	|t|}g }|D ]\}} fdd|D |fdd|D }	|
|	 q1|S )Nr   r   r*   c                    s   i | ]}|  |qS r$   r$   )r8   r   )axisr$   r%   
<dictcomp>      z+Table._get_rows_or_cols.<locals>.<dictcomp>c                    s   g | ]}  |qS r$   )getrw   )xdictr$   r%   re     r   z+Table._get_rows_or_cols.<locals>.<listcomp>)r   r   r/   r   r   r.   ro   rQ   r>   r?   r"   )
r   r   ZantiaxisrA   Zxsgroupedrowsy	row_cellsrowr$   )r   r   r%   _get_rows_or_cols  s   
zTable._get_rows_or_colsc                 C   
   |  tS rF   )r   r   r   r$   r$   r%   r        
z
Table.rowsc                 C   r   rF   )r   r   r   r$   r$   r%   columns  r   zTable.columnskwargsc                    s   | j j}g }dtdtdtfdd| jD ][g }fdd|D }jD ]D  d u r.d }n6 fdd|D }t|rbd	|v rX d
  d  |d<  d  d  |d<  |d< tj	|fi |}nd}|
| q%|
| q|S )Ncharr_   r   c                 S   sX   | d | d  d }| d | d  d }|\}}}}t ||ko*||k o*||ko*||k S )Nr!   r)   rY   r    r(   )r   )r   r_   Zv_midZh_midr    r!   r(   r)   r$   r$   r%   char_in_bbox  s   z#Table.extract.<locals>.char_in_bboxc                    s   g | ]
} |j r|qS r$   r^   r8   r   )r   r   r$   r%   re     r   z!Table.extract.<locals>.<listcomp>c                    s   g | ]	}| r|qS r$   r$   r   )r   r   r$   r%   re     s
    
ZlayoutrY   r   Zlayout_widthr   r   Zlayout_heightZlayout_bbox )r   charsr   r   r   r   r   rH   r   Zextract_textr"   )r   r   r   Z	table_arrZarrZ	row_charsZ	cell_textZ
cell_charsr$   )r   r   r   r%   extract  s,   

zTable.extractN)r   r   r   r   r   r   propertyr_   r   r   r   r   r   r   r   r=   r   r$   r$   r$   r%   r   y  s    	"r   )lineslines_stricttextexplicit)snap_tolerancer2   r3   join_tolerancer4   r5   edge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerancec                   @   r   )
UnsetFloatNr   r$   r$   r$   r%   r     r   r   c                   @   s*  e Zd ZU dZeed< dZeed< dZee	e
eef   ed< dZee	e
eef   ed< eZeed< eZeed< eZeed	< eZeed
< eZeed< eZeed< dZeed< eZeed< eZeed< dZeed< eZeed< eZeed< dZ ee!ee"f  ed< dddZ#e$dee% dd fddZ&dS )r   r   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r2   r3   r   r4   r5   r   r   r   r   r   r   r   text_settingsr   c                 C   s   t D ]}t| |p
ddk rtd| dqdD ]}t| |d }|tvr2t| ddt dq| jd	u r;i | _d
D ]}|| jvrN| jdd| j|< q=d| jv rX| jd= dD ]\}}t| |tu rnt| |t| | qZd	S )a  Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.
        r   zTable setting 'z' cannot be negative)
horizontalvertical	_strategyz_strategy must be one of{,}N)r   r   r'   r   ))r2   r   )r3   r   )r4   r   )r5   r   )r   r   )r   r   )	NON_NEGATIVE_SETTINGSgetattrr-   TABLE_STRATEGIESjoinr   r   UNSETsetattr)r   Zsettingr   strategyattrfallbackr$   r$   r%   __post_init__  s4   


zTableSettings.__post_init__settingsc                 C   s   |d u r|  S t || r|S t |tr@i }i }| D ]\}}|d d dkr0|||dd  < q|||< q||d< | di |S td| )N   Ztext_r   zCannot resolve settings: r$   )
isinstancedictr:   r-   )clsr   Zcore_settingsr   r9   r   r$   r$   r%   resolve+  s   


zTableSettings.resolve)r   N)'r   r   r   r   r=   __annotations__r   r   r   r   r   r   r   r   DEFAULT_SNAP_TOLERANCEr   r   r2   r3   DEFAULT_JOIN_TOLERANCEr   r4   r5   r   DEFAULT_MIN_WORDS_VERTICALr   r   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r   r   r   r   r   classmethodT_table_settingsr   r$   r$   r$   r%   r     s*   
 
3c                   @   s6   e Zd ZdZddddee fddZdefd	d
ZdS )TableFindera0  
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    Nr   r   r   c                    s^   | _ t| _   _t j jj jj _	t
 j	 _ fddt jD  _d S )Nc                    s   g | ]}t  j|qS r$   )r   r   )r8   Z
cell_groupr   r$   r%   re   T  s    z(TableFinder.__init__.<locals>.<listcomp>)r   r   r   r   	get_edgesr   rl   r   r   rk   r   r   r   r   )r   r   r   r$   r   r%   r   J  s   

zTableFinder.__init__r   c              
   C   s  | j }dD ]'}t||d }|dkr,t|d| d }t|dk r,td| d| d	q|j}|j}|d
ks;|d
krG| jjdi |jpDi }g }|j	pMg D ]9}	t
|	trit|	D ]}
|
d dkrg||
 qZqN||	|	| jjd | jjd | jjd | jjd  dd qN|dkrt| jjd}n!|dkrtj| jjddd}n|d
krt||jd}n|dkrg }|| }g }|jpg D ]9}	t
|	trt|	D ]}
|
d dkr||
 qq|| jjd | jjd | jjd | jjd  |	|	dd q|dkr	t| jjd}n$|dkrtj| jjddd}n|d
kr&t||jd}n|dkr-g }|| }t|t| }t||j|j|j|jd}tj||jdS )N)r   r   r   r   Z	explicit__linesrY   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r   r   r   r   r   r`   r   r   line)Z	edge_typerK   r   r   )r    r(   rN   r!   r)   r   )r2   r3   r4   r5   )Z
min_lengthr$   )r   r   rH   r-   r   r   r   Zextract_wordsr   r   r   r   r   Zobj_to_edgesr"   r_   Zfilter_edgesr   rg   r   r   rW   r   r.   rB   r2   r3   r4   r5   r   )r   r   r   r   r   Zv_stratZh_stratrC   Z
v_explicitZdescr#   Zv_baser   Z
h_explicitZh_baser   r   r$   r$   r%   r   X  s   











zTableFinder.get_edgesrF   )	r   r   r   __doc__r   r   r   r   r   r$   r$   r$   r%   r   ?  s    
r   )r   r   )6r>   dataclassesr   operatorr   typingr   r   r   r   r   r	   r
   r   r   r   r   Z_typingr   r   r   r   r   r   r   r   r   r   r=   ZT_intersectionsr   r   r   r&   r1   rB   r   rW   rg   rl   r   r   objectr   r   r   r   r   r   rZ   r   r   r   r   r$   r$   r$   r%   <module>   s    , 


"
,
@
?=SZ