
    2ki                     2   d dl Z  e j        d            e j        d           d dlZd dlZd dlmZmZ d dlZd dl	m
c mZ d dlZd dlZd dlZd dlmZ d dlmc mZ ej        j                                         d Zd
dZd Zed	k    r e             dS dS )    Nignore)urlparseurljoin)dequec                    t          d| v r| nd| z             }|j         d|j         d}	 t          j        |d          }|j        dk    r3t          d|            t          |j        dz   |j        z             }|S t          d|            t          j	        |j
                  }g }|                    d	          D ]N}|j        }|                    |           t          d
|            t          j                                         O|S # t"          $ rn}	t          d|	            	 t          |j        dz   |j        z             }|cY d}	~	S # t"          $ r#}
t          d|
            g cY d}
~
cY d}	~	S d}
~
ww xY wd}	~	ww xY w)zFetch and parse sitemap.xml.://http://z/sitemap.xml   )timeout   z[-] sitemap.xml not found at z[+] Found sitemap.xml at z3.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc  - z![-] Could not parse sitemap.xml: Nz[-] Sitemap generation failed: )r   schemenetlocrequestsgetstatus_codeprintgenerate_sitemapET
fromstringcontentfindalltextappendsysstdoutflush	Exception)urlparsedsitemap_urlr	generatedrooturlsurl_elemueexs              sitemap_parser.pyparse_sitemapr+      s   Uc\\ccy3??F]BBv}BBBKLa000=C?+??@@@()>)NOOI7+77888}QY''%Z[[ 	 	HAKKNNN***J   5!55666	()>)NOOI 	 	 	8B88999IIIIIIIIIII	sU   AD  BD 
FF0 EF
F E>3F4F8F>FFFr   c                 "   t          d| v r| nd| z             }|j         d|j         }t                      }g }t	          |g          }t          j        dt
          j                  }t          d           t          d           t          j
                                         |rt          |          |k     r|                                }||v r0|                    |           	 t          j        |dd          }	|	j        d	k    ri|                    |           t          d
|            t          d| d           t          j
                                         	 |	j                            dd          }
n# t*          $ r
 |	j        }
Y nw xY w|                    |
          D ]h}|                                }|                    d          sN|                                                    d          s'|                                                    d          r{|                    d          r|j        dz   |z   }n=|                    d          s|                    d          r|}nt7          ||          }	 t          |          }|j        |j        k    r|                    d          d         }||vr<||vr8t          |          t          |          z   |k     r|                    |           Y# t*          $ r Y fw xY wn# t*          $ r Y w xY w|rt          |          |k     t          d           t          dt          |           d           t          j
                                         |S )aC  Simple site crawler to generate a sitemap.xml file for the given base URL.

    This is a best-effort generator that follows internal links and writes
    a sitemap XML file to ./generated_sitemaps/<host>.xml. It does not
    execute javascript and is intended as a helpful fallback when no
    sitemap.xml is present.
    r   r	   zhref=["\']([^"\']+)["\']z<?xml version="1.0" ?>z<<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">   T)r   allow_redirectsr   r   z  <url>
    <loc>z</loc>
  </url>zutf-8replace)errors#zmailto:zjavascript:z//:zhttps://r   z	</urlset>z[+] Generated sitemap with z URL(s))r   r   r   setr   recompile
IGNORECASEr   r   r   r   lenpopleftaddr   r   r   r   r   decoder   r   r   strip
startswithlowerr   split)base_urlmax_urlsr    baseseenr%   qlink_recurr"   htmlmhref	candidatecand_parseds                  r*   r   r   3   s    %8"3"3hhX9MNNFm////D55DDtfAj4bmDDG 

"###	
HIIIJ
 )D		H$$iikk$;;%	S!TBBBA}##KK,,,<s<<<===Jy''	'BB   v__T**  wwyy??3'' 4::<<+B+B9+M+M QUQ[Q[Q]Q]QhQhivQwQw ??4(( 3 & 3d :II__Y// 34??:3N3N 3 $II 'T 2 2I	"*9"5"5K")V]::  ) 4 4Q 7I ,,!1C1CD		TWXYTZTZHZ]eHeHe+++    H%(  	 	 	H	Q  )D		H$$X 
+	
:D		
:
:
:;;;JKso   +"L AL 'F L FL FC3L L	+L ,AL	L 	
LL LL 
L)(L)c                     t          j        d          } |                     dd           |                                 }d|j        v r|j        n	d|j        z   }t          |          }|rwt          dt          |           d	           |d d
         D ]}t          d|            t          |          d
k    r't          dt          |          d
z
   d           d S d S d S )Nz/Parse robots.txt and sitemap.xml for endpoints.)descriptionr   z(Target URL or domain (e.g., example.com))helpr   r	   z
[+] Found z URL(s) in sitemap.xml:   r   z
  ... and z more)argparseArgumentParseradd_argument
parse_argsr   r+   r   r7   )parserargstargetr%   r'   s        r*   mainrV   y   s   $1bcccF
$NOOOD$(**TXX	DH0DF  D 4=3t99===>>>crc 	 	A***t99r>>2s4yy|222333334 4 >    __main__)r   )warningsfilterwarningssimplefilterr   rO   urllib.parser   r   r   xml.etree.ElementTreeetreeElementTreer   osr4   tempfilecollectionsr   xml.dom.minidomdomminidompackagesurllib3disable_warningsr+   r   rV   __name__ rW   r*   <module>rk      s>      ! ! !  h      * * * * * * * * 



 " " " " " " " " " 				 				        ! ! ! ! ! ! ! ! !    * * , , ,  BD D D DL4 4 4 zDFFFFF rW   