o
    he                  	   @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZm Z  d d	l!m"Z" d d
l#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl&m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= e>dZ?e@g dZAerddlBmCZC ddlDmEZE ddddddZFdeGdeHfd d!ZId"eed#f deeeeeJeKf d#f  eeH f fd$d%ZLd"edeeeeeJeKf d#f  eeH f fd&d'ZMd(eeHef deeHef fd)d*ZNG d+d, d,eZOd@d-e(d.e)de(fd/d0ZPd-e(d1e)de(fd2d3ZQG d4d5 d5e-ZRG d6d7 d7eRZSd8e(d9e(ddfd:d;ZTG d<d= d=eSZUG d>d? d?eSZVdS )A    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)	normalize)warn)PDFPageAggregator)LTCharLTComponentLTContainerLTCurveLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)PDFStructTreeStructTreeMissing)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)MalformedPDFExceptionPdfminerException)TextMapz^LT)advheightZ	linewidthptssizeZsrcsizewidthx0x1y0y1bitsmatrixZuprightfontnametextZ	imagemaskZ
colorspaceZevenoddfillnon_stroking_colorstrokestroking_colorstreamnamemcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r9   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r9   Zsplit_atprefixsuffixZ
suffix_new rP   B/var/www/html/venv/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytes\   s   
rR   color.c                 C   s4   t | d tr| d d pd t| d jfS | d fS )NrI   )
isinstancer   r(   r@   )rS   rP   rP   rQ   separate_patterng   s   rU   c                 C   sJ   | d u rdS t | tr| }t|S t | trt| }t|S | f}t|S )N)NN)rT   tuplelistrU   )rS   Z	tuplefiedrP   rP   rQ   normalize_colorp   s   

rX   kwargsc                 C   s   dd |   D S )Nc                 S   s(   i | ]\}}|t |trt|n|qS rP   )rT   rW   rV   ).0keyvaluerP   rP   rQ   
<dictcomp>   s    z'tuplify_list_kwargs.<locals>.<dictcomp>)items)rY   rP   rP   rQ   tuplify_list_kwargs~   s   r_   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagrB   propsrE   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.ZMCIDN)r(   r@   rb   rT   dictra   )selfrB   rc   rP   rP   rQ   	begin_tag   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)rb   ra   re   rP   rP   rQ   end_tag   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s,   | j jr| j jd }| j|_| j|_dS dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rI   N)Zcur_item_objsra   rA   rb   rB   )re   Zcur_objrP   rP   rQ   tag_cur_item   s
   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charrj   )re   argsrY   r.   	__class__rP   rQ   rl      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rk   render_imagerj   re   rm   rY   rn   rP   rQ   rq         z/PDFPageAggregatorWithMarkedContent.render_imagec                    rp   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rk   
paint_pathrj   rr   rn   rP   rQ   rt      rs   z-PDFPageAggregatorWithMarkedContent.paint_pathNrE   N)__name__
__module____qualname____doc__ra   r	   int__annotations__rb   rM   r   r   rf   rh   rj   floatrl   rq   rt   __classcell__rP   rP   rn   rQ   r`      s   
 

r`   box_rawrotationc                 C   sp   t dd | D std|  t| d | d f\}}t| d | d f\}}|dv r2||||fS ||||fS )	Nc                 s   s    | ]	}t |tjV  qd S ru   )rT   numbersNumberrZ   xrP   rP   rQ   	<genexpr>   s    z!_normalize_box.<locals>.<genexpr>z0Bounding box contains non-number coordinate(s): r   rH   r      )Z   i  )allr+   sorted)r   r   r3   r4   r5   r6   rP   rP   rQ   _normalize_box   s   r   	mb_heightc                 C   s    | \}}}}||| ||| fS ru   rP   )r   r   r3   r5   r4   r6   rP   rP   rQ   _invert_box   s   r   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dgddd	ed
edefddZdhddZedefddZedefddZedeeeef  fddZedefddZedefddZedefddZedeeef fddZdeeef deeef fd d!Zd"edefd#d$Z d%ee! de"eddf fd&d'Z#deeef fd(d)Z$	did*e%e& de'fd+d,Z(	did*e%e& dee) fd-d.Z*	did*e%e& de%e) fd/d0Z+	did*e%e& deeee%e    fd1d2Z,	did*e%e& de%eee%e    fd3d4Z-d5ede.fd6d7Z/					djd8e0ee1e f d9e	d:e	d;ed<e	d=e	d5edeeeef  fd>d?Z2d5edefd@dAZ3d5edefdBdCZ4d5edefdDdEZ5	dkdFe	d<e	d5edefdGdHZ6	dldJe7dKe	dLe	ddMfdNdOZ8	dldJe7dKe	dLe	ddMfdPdQZ9	dldJe7dKe	dLe	ddMfdRdSZ:dTe;ege	f ddUfdVdWZ<d5eddUfdXdYZ=				I	IdmdZe%e0ee>f  d[e%e0ee>f  d\e%e0ee>f  d]e	d^e	dd_fd`daZ?didbe%ee  deeef fdcddZ@defdedfZAdS )nPage_layoutcached_propertiesTis_originalNr   pdfrD   page_objpage_numberinitial_doctopc           	         s   || _ | | _ | _|| _|| _ddtdtdtf fdd}|dd}|d | _t|d	| j}|d
 |d  }t	||| _
d jv rOt	t|d| j|| _n| j
| _| j
| _t | j| _d S )Nr[   defaultrE   c                    s    t  j| }|d u r|S |S ru   )r)   attrsrL   )r[   r   r\   r   rP   rQ   get_attr   s   zPage.__init__.<locals>.get_attrZRotater   ih  ZMediaBoxr   r   ZCropBoxru   )r   	root_pager   r   r   rM   r   r   r   r   mediaboxr   cropboxbboxr   _get_textmapget_textmap)	re   r   r   r   r   r   Z	_rotationZmb_rawr   rP   r   rQ   __init__   s$   


zPage.__init__rE   c                 C   s   |    | j  d S ru   )flush_cacher   cache_clearrg   rP   rP   rQ   close   s   z
Page.closec                 C      | j d | j d  S )NrH   r   r   rg   rP   rP   rQ   r2        z
Page.widthc                 C   r   )Nr   r   r   rg   rP   rP   rQ   r/     r   zPage.heightc                 C   s0   zdd t | j| D W S  ty   g  Y S w )z-Return the structure tree for a page, if any.c                 S   s   g | ]}|  qS rP   )to_dict)rZ   elemrP   rP   rQ   
<listcomp>  s    z'Page.structure_tree.<locals>.<listcomp>)r"   r   r#   rg   rP   rP   rQ   structure_tree  s
   zPage.structure_treec              
   C   sx   t | dr| jS t| jj| j| jjd}t| jj|}z|| j	 W n t
y3 } zt|d }~ww | | _| jS )Nr   )Zpagenolaparams)hasattrr   r`   r   Zrsrcmgrr   r   r   Zprocess_pager   	Exceptionr,   Z
get_result)re   ZdeviceinterpretererP   rP   rQ   layout  s    

zPage.layoutc                    sx   dt ttf dtdt ttf ffdd dtdtf fdd}tjjp(g }tt||}t	t
r:|S |S )	NptrrE   c                    sF   |d }t |D ]}| \}}||d kr jn j}||| f} q| S )Nr   rH   )ranger2   r/   )r   r   Zturnsir   ycomprg   rP   rQ   rotate_point&  s   z!Page.annots.<locals>.rotate_pointannotc                    sb  | d \}}}} ||fj } ||fj }jj}ttg ||R |\}}	}
}| di }|d| d| dd}| D ]>\}}|d urz	|d||< W qE ty   z	|d||< W n ty   j	j
rr td	| d
| d Y nw Y qEw qEjd||| |
||	 j|	 |	||
| ||	 d}|| d| v r| d< | |d< |S )NZRectAURITZContents)urititlecontentszutf-8zutf-16zCould not decode z of annotation. z will be missing.r   )r   object_typer3   r5   r4   r6   doctoptopbottomr2   r/   Pdata)r   r   r/   r   r   rL   r^   decodeUnicodeDecodeErrorr   Zraise_unicode_errorsr   r   r   update)r   Z_a_bZ_cZ_dZpt0Zpt1Zrhr3   r   r4   r   aextraskvparsedr   re   rP   rQ   parse.  s\    
zPage.annots.<locals>.parse)r   r}   r{   r   r)   r   annotsrW   maprT   CroppedPage_crop_fn)re   r   rawr   rP   r   rQ   r   $  s   *1

zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrP   )rZ   r   rP   rP   rQ   r   h  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rg   rP   rP   rQ   
hyperlinksf  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrg   rP   rP   rQ   objectsj  s   

zPage.objectsr   c                 C   s*   | j d |d  | j d | j |d  fS )Nr   r   )r   r/   )re   r   rP   rP   rQ   point2coordq  s   *zPage.point2coordobjc                    s(  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d<  j|d< dD ]}t||rGtt||j||< q6d	D ]\}}||v r^t|| \||< ||< qJt|ttfr{| } jjd urwt jj|n||d
< t|tr|j}	t|	j\|d< |d< t|	j\|d< |d< t|d trt |d |d< n#t|t!frt"t j#|d |d<  fdd|j$D |d< |j%|d<  j&d d \}
}d|v r j'|d  | |d<  j'|d  | |d<  j(|d  |d< d|v r|
dkr|d |
 |d< |d |
 |d< |S )N itemrE   c                 S   s$   | \}}|t v rt|}||fS d S ru   )	ALL_ATTRSr)   )r   r   r   resrP   rP   rQ   process_attrx  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsZscs))r>   stroking_pattern)r<   non_stroking_patternr:   r>   r   r<   r   r9   r0   c                    s$   g | ]^}}|gt  j|R qS rP   )r   r   )rZ   cmdr0   rg   rP   rQ   r     s   $ z'Page.process_object.<locals>.<listcomp>pathdashrH   r5   r6   r   r   r   r3   r   r4   ))resublt_patro   rw   lowerr   rM   r   r	   rd   filterr   __dict__r^   r   r   r*   getattrr@   rX   rT   r   r   Zget_textr   Zunicode_normnormalize_unicodeZgraphicstateZscolorZncolorbytesrR   r   rW   r   Zoriginal_pathZdashing_styler   r/   r   )re   r   kindr   attrcsZ
color_attrZpattern_attrr:   gsZmb_x0Zmb_toprP   rg   rQ   process_objectu  sV   &



zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S ru   )rT   r   r   r   r   iter_layout_objectsri   )re   r   r   rP   rP   rQ   r     s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r   r   ri   rL   append)re   r   r   r   rP   rP   rQ   r     s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S ru   )r'   resolver&   re   r   tsetrP   rP   rQ   debug_tablefinder  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS ru   )r'   r   r&   tablesr   rP   rP   rQ   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   r   rE   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r   rP   rP   rQ   sorter  s   zPage.find_table.<locals>.sorter)r[   )
r'   r   r   r   r%   r   r{   r   rW   r   )re   r   r   r   r  ZlargestrP   rP   rQ   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rP   )extracttext_settings)rZ   tabler   rP   rQ   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r'   r   r   )re   r   r   rP   r  rQ   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrP   )r'   r   r  r  r  )re   r   r   r  rP   rP   rQ   extract_table  s
   

zPage.extract_tablerY   c                 K   s\   t | jd}d|vr|d| ji d|vr|d| ji i ||}tj| jfi |S )N)Zlayout_bboxZlayout_width_charsZlayout_widthZlayout_height_charsZlayout_height)rd   r   r   r2   r/   r   Zchars_to_textmapchars)re   rY   defaultsZfull_kwargsrP   rP   rQ   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s*   | j di t|}|j||||||dS )N)r  r  r  r  r  rP   )r   r_   search)	re   r  r  r  r  r  r  rY   ZtextmaprP   rP   rQ   r    s   
zPage.searchc                 K   s   | j di t|jS r  )r   r_   	as_stringre   rY   rP   rP   rQ   extract_text,  s   zPage.extract_textc                 K      t j| jfi |S ru   )r   extract_text_simpler
  r  rP   rP   rQ   r  /     zPage.extract_text_simplec                 K   r  ru   )r   extract_wordsr
  r  rP   rP   rQ   r  2  r  zPage.extract_wordsstripc                 K   s   | j di t|j||dS )N)r  r  rP   )r   r_   extract_text_lines)re   r  r  rY   rP   rP   rQ   r  5  s   zPage.extract_text_linesFr   relativestrictr   c                 C   s   t | |||dS )N)r  r  )r   re   r   r  r  rP   rP   rQ   crop<  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )r  r  crop_fn)r   r   within_bboxr  rP   rP   rQ   r#  A     zPage.within_bboxc                 C   r   r!  )r   r   outside_bboxr  rP   rP   rQ   r%  K  r$  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S ru   )r'  )re   r&  rP   rP   rQ   r   U     
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        c                 S   s   dS )NTrP   r   rP   rP   rQ   <lambda>^  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rP   rP   )rZ   r   objsrP   rP   rQ   r]   _  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)r'  r   r^   r   r   dedupe_charsr
  )re   rY   prP   rP   rQ   r,  X  s   zPage.dedupe_chars
resolutionr2   r/   	antialiasforce_mediaboxrC   c           	      C   s   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;|||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONrC   c                 s   s    | ]}|d uV  qd S ru   rP   r   rP   rP   rQ   r   s  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )r.  r/  r0  )displayr1  rC   sum
ValueErrorr2   r/   )	re   r.  r2   r/   r/  r0  r1  rC   Z	num_specsrP   rP   rQ   to_imagec  s    zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r   r   r   r   r2   r/   s)rW   r   keysr   r   r   r   r   r   r2   r/   r   )re   r7  Z_object_typesdtrP   rP   rQ   r     s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rg   rP   rP   rQ   __repr__  s   zPage.__repr__r   rv   ru   )TTr   TT)TT)FT)NNNFF)Brw   rx   ry   r!   r   r   rM   r|   r   boolZpagesr   r{   r   r   r   propertyr2   r/   r   r   r   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   r	   r$   r&   r   r%   r   r  r  r	  r-   r   r   r
   r  r  r  r  r  r   r  r#  r%  r   r   r,  r}   r6  r   r=  rP   rP   rP   rQ   r      s6  
 

)A"Q






	






$!r   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sd   || _ |j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| 	t
j t | j| _d S ru   )rB  r   r   r   r   r   r   r   r   r   r!   r   r   r   r   )re   rB  rP   rP   rQ   r     s   zDerivedPage.__init__N)rw   rx   ry   r   r?  r|   r   r   rP   rP   rP   rQ   rA    s   
 rA  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   Zcalculate_arear5  Zget_bbox_overlap)r   rC  Z	bbox_areaZoverlapZoverlap_arearP   rP   rQ   test_proposed_bbox  s$   

rD  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r   FTrB  	crop_bboxr"  r  r  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )Nr*  rE   c                    s
   |  S ru   rP   )r*  rE  r"  rP   rQ   r     r(  z&CroppedPage.__init__.<locals>._crop_fn)r   rD  r    rk   r   r   r   r%  )re   rB  rE  r"  r  r  Zo_x0Zo_top_r3   r   r4   r   r   rn   rF  rQ   r     s   

zCroppedPage.__init__rE   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rP   )r   rZ   r   r   rg   rP   rQ   r]     s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   rB  r   r^   rg   rP   rg   rQ   r     s   


zCroppedPage.objects)rw   rx   ry   r   Zcrop_to_bboxr   r   r   r    r?  r   r@  r   rM   r   r~   rP   rP   rn   rQ   r     s"     r   c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )r'  rB  	filter_fnc                    s   |j | _ || _t | d S ru   )r   rK  rk   r   )re   rB  rK  rn   rP   rQ   r     s   zFilteredPage.__init__rE   c                    rH  )Nr   c                    s"   i | ]\}}|t t j|qS rP   )rW   r   rK  rI  rg   rP   rQ   r]     s    z(FilteredPage.objects.<locals>.<dictcomp>rJ  rg   rP   rg   rQ   r     s   


zFilteredPage.objects)rw   rx   ry   r   r   r   r?  r   r@  r   rM   r    r   r~   rP   rP   rn   rQ   r'    s      r'  r>  )Wr   r   	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   unicodedatar   r   warningsr   Zpdfminer.converterr   Zpdfminer.layoutr   r   r   r   r   r   r   Zpdfminer.pdfinterpr   r   Zpdfminer.pdfpager   Zpdfminer.psparserr   r   r   Z_typingr   r   r   r    	containerr!   Z	structurer"   r#   r  r$   r%   r&   r'   r(   r)   r*   Zutils.exceptionsr+   r,   Z
utils.textr-   compiler   setr   r3  rC   r   rD   rK   r   rM   rR   r}   r{   rU   rX   r_   r`   r   r   r   rA  rD  r   r'  rP   rP   rP   rQ   <module>   sn    0$	
!	
"
	"
"3   M(