
    e'iL:              
          d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ dZe dZe d	Ze d
ZdZdZdZd Zd0dZd Zd1dZd0dZd ZedfdZd Zedk    r	  e            Z g Z! ej"                    Z# e            Z$e$%                    d           e$%                    d           e$%                    d           e$%                    d            ej&        e$          Z'	 e D ]Z(dZ) e*e(e+          r8e(,                    d          se(,                    d           r ee(de#!          Z)e)s ee'e(          Z)e)r` e*e(e+          rTd"e(v r4d#e(v r0e(,                    d"          e)d"<   e(,                    d#          e)d#<   d$e(v re(,                    d$          e)d$<   e)re!-                    e)           	 e'.                                 n# e'.                                 w xY w e            Z/e/D ]Z0 ee0,                    d%                    Z1e!D ]Z(e(,                    d"          Z2e(,                    d#          Z3e2e31 ee0d"         e0d#          e4e2           e4e3                    Z5e5e1k    re(,                    d&          pe(,                    d'          e(,                    d(           e6e5d)          d* e(,                    d+g           D             dd,         d-Z7e0d.         -                    e7            ee!e//            e j8        e           dS )2    N)BeautifulSoup)urljoin)	webdriver)Options)Byz/home/asher/public_htmlz/news/sources.jsonz
/pulse.txtz/news/pulse.jsoni  zKhttps://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.geojsonzIhttps://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_day.geojsonc                     t           j                            t                    rl	 t	          t          d          5 } t          j        |           cddd           S # 1 swxY w Y   n)# t          $ r}t          d|            Y d}~nd}~ww xY w	 t	          t          d          5 }d |D             }d |D             cddd           S # 1 swxY w Y   dS # t          $ r dddd	d
dgcY S w xY w)zYReturn list of sources. Prefer structured JSON config, fall back to pulse.txt or default.rNzError loading sources.json: c                 ^    g | ]*}|                                 |                                 +S  strip).0lines     %/home/asher/public_html/news/pulse.py
<listcomp>z read_sources.<locals>.<listcomp>&   s-    @@@d4::<<@TZZ\\@@@    c                     g | ]}||d S ))nameurlr   )r   ls     r   r   z read_sources.<locals>.<listcomp>(   s     999aQq))999r   zBBC Newszhttps://www.bbc.com/newsz$http://feeds.bbci.co.uk/news/rss.xmlg{I@g6[)r   r   rsslatlon)ospathexistsSOURCES_FILEopenjsonload	Exceptionprint
INPUT_FILEFileNotFoundError)fhefliness       r   read_sourcesr)      s    
w~~l## 6	6lC(( %By}}% % % % % % % % % % % % % % % % % 	6 	6 	644455555555	6X*c"" 	:a@@a@@@E995999	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:  X X X#,FOu  F  OV  W  W  X  	X  	X  	XXsp   A( AA( A  A( #A $A( (
B2B		BC 'C?C CC CC C/.C/c                    	 |pt          j                    }|                    | d          }|                                 t	          |j        d          }|                    dd          p|                    dddi          }|r+|                    d	          rt          | |d	                   S |                    ddd
i          }|r+|                    d	          rt          | |d	                   S |                    d          }|r+|                    d          rt          | |d                   S n# t          $ r Y dS w xY wdS )zqTry to fetch best image from a given article or page by checking og:image, twitter:image and first image element.   timeouthtml.parsermetaog:imagepropertyr   )attrscontentztwitter:imageimgsrcN)	requestsSessiongetraise_for_statusr   textfindr   r!   )r   sessionsr	   soupogtr5   s           r   fetch_image_from_pagerB   -   sa   )x'))EE#qE!!	QV]33YYv
Y33dtyyPVXbOcy7d7d 	/"&&## 	/39...IIfV_$=I>> 	.y!! 	.3)---ii 	,3775>> 	,3E
+++   tt4s    B1D= 4AD= :AD= =
E
Ec                 ^   t          |t                    r|                    d          n|}t          |t                    r|                    d          n|}t          d|            	 |                     |           | j                            d          d                             d          d                             d          d                                         p|}|                     t          j	        d          }g }|d	d
         D ]^}|j
                                        }|rAt          |          dk    r.||vr*|                    |           t          |          dk    r n_||d |D             dS # t          $ r }t          d| d|            Y d	}~d	S d	}~ww xY w)zZScrape a generic website for headlines. entry can be a string URL or a dict with url/name.r   r   zScanning frequency: |r   :-z
h1, h2, h3N         c                     g | ]}d |iS titler   r   hs     r   r   z$scrape_universal.<locals>.<listcomp>\   s    >_>_>_PQ|>_>_>_r   sourcer   	headlinesz
Static on : )
isinstancedictr9   r"   rL   splitr   find_elementsr   CSS_SELECTORr;   lenappendr!   )	driverentryr   r   	site_nameelementsrQ   r&   r;   s	            r   scrape_universalr^   E   s   (55
@%))E


5C *5$ 7 7@599VSD	
&
&
&'''

3L&&s++A.44S99!<BB3GGJPPRRZVZ	 ''FF	#2# 	 	A6<<>>D D		B4y+@+@  &&&y>>Q&&E $C>_>_U^>_>_>_```   %3%%!%%&&&ttttts   ,DF 
F,F''F,rI   c                 	   |pt          j                    }|                     d          p|                     d          }||                     d          g d}|                     d          p|                     d          }	 |rDt          d| d|            t	          j        |          }|j        d|         }|sqt          d	| d
           	 |                     d          }	|                    |	d          }
|
                                 t          |
j	        d          }g }dD ]*}|
                    |                    |                     +g }t                      }|D ]}t          |          |k    r nv|                    d          }|rt          |          dk     s||v rI|                    |           d}|                    d          p|                    d          }|r8|                    d          r#t%          |	|                    d                    }d}|                    d          p|                    dd          }|r~d}t)          |dd          r dD ]}|                    |          }|r|} nt+          |t,          t.          f          r
|r|d         }t+          |t0                    rt%          |	|          }|                    ||p|	|d           |r||d<   |S n,# t4          $ r}t          d| d|            Y d}~nd}~ww xY wdS |D ]}|                    d          }|                    d           }d}d!|v r'|j        r |j        d                             d          }|s+d"|v r'|j        r |j        d                             d          }|sd#|v r|j        D ]}|                    d$          }|                    d%          }d&}t+          |t0                    r|                    d'          rd}nOt+          |t,          t.          f          r3|D ]0} t+          | t0                    r|                     d'          rd} n1|d(k    r|r|                    d          } n|s|rt?          ||)          }|d                             |||d           |S n,# t4          $ r}t          d*| d|            Y d}~nd}~ww xY wdS )+zTry to parse RSS/Atom feed using feedparser or fall back to scraping.
    Returns dict {source: name, url, lat, lon, headlines: [{title, link, image}], }
    r   r   rO   r   feedzParsing feed for z -> Nz%Feed parsed but no entries found for z!, attempting page scrape fallbackr+   r,   r.   )h1h2h3Tr   rH   ahrefr5   r/   r0   r1   r9   )r6   zdata-srcr4   r   )rL   linkimagerQ   z Fallback page scrape failed for rR   rL   rf   media_contentmedia_thumbnaillinksreltypeFrg   	enclosure)r=   zFeed parse error for ) r7   r8   r9   r"   
feedparserparseentriesr:   r   r;   extendfind_allsetrX   get_textaddr<   find_parentr   	find_nextgetattrrS   listtuplestrrY   r!   rh   ri   rj   
startswithrB   )!rP   	max_itemsr=   r>   r\   outrss_urlparsedrp   page_urlsrespr?   elemstagrQ   seenr&   r;   rf   rd   r5   img_tagsrc_valkeyvalr[   rL   rg   r   rk   type_valis_imagetvs!                                    r   parse_feed_for_sourcer   c   s    	%8#%%A

6""7fjj&7&7Ivzz%'8'8r
J
JCjj56::f#5#5Gh8 b	>i>>W>>???%g..FnZiZ0G  8jijjjkkk4O%zz%00HEE(AE66E**,,,(]CCD E/ 9 9T]]3%7%78888 "I55D"  b  by>>Y66!E zzz55# %s4yy2~~$  $FF3KK=1==+=+= Dv D#*8QUU6]]#C#CD #"#++e"4"4"^		&S]	8^8^" A&*G&wt<< .+I !. !.C*1++c*:*:C'* %.25(-%.  *'D%=AA 5g 5*1!*)'377 A&-h&@&@!((4AQ\_)`)`aaaa  #+4K("
# ! O O OMYMM!MMNNNNNNNNO t   X  X		'**yy(( "e++0C+!/266u==E @!2e!;!;@U!;!1!488??E "E!1!1"[ " "eeEll#$55==#(%h44 *9L9LW9U9U *'+HH'4-@@ *&. * *#-b##6#6 !*2==;Q;Q !*/3H$)E+--(-$%EE&MME!E  C C1$BBBEK ''%PU(V(VWWWWJEb	L  8 8 86i66166777777778 4sK   AS H-L S 
L.L)$S )L..S 3FS 
S5S00S5c                 F   t           j                                                             d          }|ddd | D             d}|r||d<   t          t          d          5 }t          j        ||d	           d d d            n# 1 swxY w Y   t          d
|            d S )Nz%Y-%m-%d %H:%M:%SOnline)updatedstatusc                     g | ]}||S r   r   )r   items     r   r   z#save_pulse_json.<locals>.<listcomp>   s    444$t4444r   )r/   r`   earthquakesw   )indentzPulse data captured at )datetimenowstrftimer   	JSON_FILEr   dumpr"   )	data_listr   	timestamppayloadr'   s        r   save_pulse_jsonr      s   !%%''001DEEI
 !
 
 54)444 G  -!, 
i		 (	'1Q''''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( 

/I
/
/00000s    BBBc                    ddl m}m}m}m}m} t          || |||f          \  } }}}|| z
  }	||z
  }
 ||	dz            dz   ||            ||          z   ||
dz            dz  z  z   }d | ||                    z  }d|z  S )z8Return distance in kilometers between two lat/lon pairs.r   )radianssincosasinsqrt   g     @)mathr   r   r   r   r   map)lat1lon1lat2lon2r   r   r   r   r   dlatdlonrd   cs                r   haversine_kmr      s    22222222222222 4tT*BCCD$d$;D$;DDFQT33t99,SSa[[!^;;A	DDaMMAA:r   i,  c                    	 t          j        | d          }|                                 |                                }g }|                    dg           d|         D ]\}|                    di           }|                    di           }|                    d          pg }|rt	          |          dk     r\|d	         |d
         }
}	|                    d          }|                    d          }|                    d          }|                    d          }|                    d          }|t          |          nd}|d}n|dk    rd}nt          |          }|                    ||||||r5t          j        	                    |dz            
                    d          nd|
|	|g d
           ^|S # t          $ r}t          d|           g cY d}~S d}~ww xY w)z?Fetch USGS GeoJSON and return list of events with basic fields.
   r,   featuresN
propertiesgeometrycoordinatesr   r      magplaceidtimer   unknown	   z9+i  z%Y-%m-%dT%H:%M:%SZ)
r   r   
mag_bucketr   time_mstime_isor   r   r   nearbyzUSGS fetch failed)r7   r9   r:   r   rX   intr{   rY   r   utcfromtimestampr   r!   r"   )r   
max_eventsr	   geoeventsfeatpropsgeomcoordsr   r   r   r   event_idr   url_linkbucketr   r&   s                      r   fetch_usgs_eventsr      s   +Lb)))	ffhhGGJ++KZK8 !	 !	DHH\2..E88J++DXXm,,2F S[[1__ay&)C))E""CIIg&&Exx~~Hii''Gyy''H "%SXXXdF~&

1!

 [[
MM("ov  AH->>wt|LLUUVjkkk  }A         !1%%%						s   F=G   
G&
G!G&!G&c                     	 t          |           }n# t          $ r Y dS w xY w|dk     rdS |dk     rdS |dk     rdS |dk     rdS |d	k     rd
S |dk     rdS |dk     rdS |dk     rdS dS )zRReturn a heuristic radius in km for matching news sources to earthquake magnitude.2   g      ?r   g       @   g      @g      @d   g      @   g      @i  g      @i  g       @i  ip  )floatr!   )r   ms     r   mag_to_radius_kmr   )  s    #JJ   rr 	3wwr3wwr3wwr3wws3wws3wws3wwt3wwt4s    
  __main__Tz
--headlessz--no-sandboxz--disable-dev-shm-usagez~user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36)optionsr   r`   )r}   r=   r   r   countryr   rP   r   r   r   c                 f    g | ].}t          |t                    r|                    d           n|/S rK   )rS   rT   r9   rM   s     r   r   r   {  s7    %u%u%uWX
1d8K8K&RaeeGnnnQR%u%u%ur   rQ   r   )rP   r   distance_kmrQ   r   )r   )N)rI   N)9r   r   r   r   r7   rn   bs4r   urllib.parser   seleniumr   !selenium.webdriver.chrome.optionsr   selenium.webdriver.common.byr   BASE_DIRr   r#   r   REFRESH_INTERVALUSGS_MONTH_URLUSGS_DAY_URLr)   rB   r^   r   r   r   r   r   __name__sourcescollected_datar8   r=   r   add_argumentChromerZ   r6   datarS   rT   r9   rY   quitr   ev	radius_kms_lats_lonr   distroundr   sleepr   r   r   <module>r      s      				                              5 5 5 5 5 5 + + + + + + %...$$$
)))	 ^ZX X X(   0  <s s s sj1 1 1 1,   )S - - - -`  4 z<%,.."("$$'))\***^,,,6777  ^  	_  	_  	_!!'222	 0 0:c4(( Tcggenn T T007SSSD  9++FC88D  =JJsD11 =||&)ggennU&)ggennU C''*-'')*<*<Y 0"))$///)0, KKMMMMFKKMMMM #"$$ 	0 	0B((77I% 0 0=EM#|BuIr%y%%,,eUU9$$ #&''("3"3"Fswwv"wwu~~',uT1~~%u%u\_\c\cdoqs\t\t%u%u%uvxwxvx%y	 F xL''///0  	F;;;;
#$$$y<% s   ?CG' 'G=