³ò
‰g/Gc           @   s5  d  Z  d d k Z d d k Z d d k Z d d k Z d d k Z d d k l Z d d k Z d d k	 Z	 d d k
 Z
 d d k Z d d k Z d d k Z d d k Z d d k l Z e i d ƒ Z e i d ƒ Z e i d ƒ Z e i d ƒ Z e a d	 „  Z d
 „  Z d „  Z d „  Z d „  Z e d „ Z d S(   s†   
Fetch either a single feed, or a set of feeds, normalize to Atom and XHTML,
and write each as a set of entries in a cache directory.
iÿÿÿÿN(   t   minidom(   t   StringIOs   ^\w+:/*(\w+:|www\.)?s   [?/:|]+s   ^[,.]*s   [,.]*$c         C   sŒ  yS t  i | ƒ o? t | t ƒ o | i d ƒ i d ƒ } qR | i d ƒ } n Wn n Xt | t ƒ o | i d ƒ } n t  i d | ƒ } t i d | ƒ } t	 i d | ƒ } t
 i d | ƒ } t | ƒ d j ož | i d ƒ } xŒ t t | ƒ d d ƒ D]n } t d i | |  ƒ ƒ d j  oH d d	 k } d i | |  ƒ d | i d i | | ƒ ƒ i ƒ  } PqqWn t i i |  | ƒ S(
   s•   Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    s   utf-8t   idnat    t   ,iú   i    iÿÿÿÿiÜ   N(   t   re_url_schemet   matcht
   isinstancet   strt   decodet   encodet   unicodet   subt   re_slasht   re_initial_cruftt   re_final_cruftt   lent   splitt   ranget   joint   md5t   newt	   hexdigestt   ost   path(   t	   directoryt   filenamet   partst   iR   (    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyR      s.      #c         C   s*   t  | d ƒ } | i |  ƒ | i ƒ  d S(   s     write the document out to disk t   wN(   t   opent   writet   close(   t   xdoct   outt   file(    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyR   6   s    c         C   s   t  i  |  ƒ } | d d j S(   Ni    t   httpt   https(   s   https   https(   t   urlparse(   t   urit   parsed(    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyt   _is_http_uri<   s    c         C   s]  t  i } t i ƒ  } | i d ƒ ps | i d ƒ o# t | i ƒ d j o d | _ q˜ | i o) | i	 i
 i i ƒ  d j o d | _ q˜ d | _ n t i t i ƒ  d t i |  ƒ ƒ } | i d j o¥ | i d	 ƒ o• | i | i d
 <| i d ƒ o7 t | i ƒ d j o! | i d |  ƒ d | i d <qu|  | i j o | i d |  ƒ qu| i d |  | i ƒ n| i d j oP | i d ƒ o@ t | i ƒ d j o* | i d |  | i ƒ | i | i d
 <n£| i d j o| i d	 ƒ o | i | i d
 <|  | i j o | i d |  ƒ n | i d |  | i ƒ | i i d ƒ pE | i i d ƒ o. | i i } t i | ƒ | j o d  Sq‘qïqu| i i i d ƒ o d  Sn | i i i d ƒ o% | i i i d ƒ o | i d =qunƒ | i d j o | i d |  ƒ n_ | i d j o | i d |  ƒ n; | i d j o | i d | i |  ƒ n | i d |  ƒ | i oM | i oC | i | _ | i i d d ƒ d j | _ | i i d ƒ | _ n t | i ƒ | i d  <| i d! ƒ oø | i d" ƒ o | i o | i | i d# <n: | i i d" ƒ o& | i d" o | i d" | i d# <n | i i d$ ƒ o | i d$ | i d% <n8 | i d& ƒ o' | i o t i  | i ƒ | i d% <n | i i d' ƒ o | i d' | i d( <qën | i oÕ | i i d) ƒ p t! ƒ  | i d) <n d* } | i i d+ ƒ o
 d, } n | i dM j o
 d/ } n xn | i i" D]% } | i# d0 j o | | d1 <PqfqfW| i i" i$ t i% h  d0 d2 <| d1 <|  d3 <ƒ ƒ n x4 t i& |  ƒ i' ƒ  D] \ }	 }
 |
 | i d4 |	 <qàWt( i( |  | ƒ d5 d6 k  l) } t* d  j o | i, ƒ  a* n h  } xÐ | i D]Å } | i d7 ƒ p | i- o, t. i- d  | ƒ | d7 <| d7 p qNqœn d8 } | i d9 ƒ o | i/ } n | i d: ƒ o | i0 } n | | i | i- dN ƒ d j o | | f | | i- <qNqNWt i1 ƒ  } xl| i2 ƒ  D]^\ } } t3 | | i- ƒ } d  } | i d; ƒ p | d; o | i d< d  ƒ | d; <n | i d; ƒ o$ y t4 i5 | i6 ƒ } Wq¿q¿Xn | p^ y t7 i8 | ƒ i9 } Wq$| i i d; ƒ o' y t4 i5 | i i6 ƒ } WqqXq q$Xn | p t i ƒ  } n t i | ƒ | d; <t. i. | | ƒ } | i: ƒ  i; d= ƒ } | i< ƒ  x; t i= |  ƒ D]* } t> i? | | d> d? ƒ} | p PqqW| p+ t7 i@ iA | ƒ o t7 iB | ƒ q0q0n tC | | ƒ t7 iD | | | f ƒ t* d  j oo | i i d7 | i i d@ d  ƒ ƒ } | o@ tE | ƒ tF j o | i; d= ƒ } n | t* t3 d8 | i- ƒ <qŽ	q0q0Wt* o t* iG ƒ  n t i |  ƒ oê g  } | i D]$ } | i d; ƒ o | | i6 qÅ	qÅ	~ } | iH ƒ  | o! t iI dA | d5 ƒ | i d <n0 | i i d ƒ o t i | i i ƒ g } n | p | d5 | j  o1 dB t i |  ƒ } | i | ƒ | | i d <q¡
n | i dC j oP | i i d ƒ o | i d =n | i i d ƒ o | i d | i d <qÎnÎ | i dD j o dE | i d <n­ | i dF j o dG | i d <nŒ | i d j o dH | i d <nk | i d j o dI | i d <nJ | i d j o dJ | i d <n) | i d j o dK | i | i d <n t7 i@ iA | ƒ p t7 iJ | ƒ n tK iL dL t  iM ƒ } t. iN | iO | i | i | i ƒ tC | i: ƒ  i; d= ƒ t3 | |  ƒ ƒ | i< ƒ  d  S(O   Nt   statust   entriesi    iÈ   t   timeouti˜  iô  i€Q t   urlt   planet_http_locations
   No data %ss   no datat   planet_messages   Updating feed %ss   Updating feed %s @ %si-  s    Feed has moved from <%s> to <%s>i0  s   Feed %s unchangeds   Feed %s unchanged @ %st   planet_updateds   no activity int	   duplicateiš  s   Feed %s gones   Feed %s timed outi  s   Error %d while updating feed %st   planet_bozot   truet   planet_formatt   planet_http_statust   headerst   etagt   planet_http_etags   last-modifiedt   planet_http_last_modifiedt   modifieds   -content-hasht   planet_content_hasht   linkss   application/atom+xmlt   rsss   application/rss+xmlt   rss090t   rss10s   application/rdf+xmlt   selft   typet   relt   hreft   planet_iÿÿÿÿ(   t   idindext   idR   t	   publishedt   updatedt   updated_parsedt   published_parseds   utf-8t   modet   filtert   links   %Y-%m-%dT%H:%M:%SZs   no activity in %d daysiâ   i“  s   403: forbiddeni”  s   404: not founds   408: request timeouts	   410: gones   internal server errors   http status %ssD   <feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>
(   s   rss090s   rss10(   R   (P   t   planett   loggert   configt   cache_sources_directoryt   has_keyR   R+   R*   t   bozot   bozo_exceptiont	   __class__t   __name__t   lowert   timet   gmtimet   activity_thresholdR-   t   feedt   warningt   infoR0   t
   feedparsert   _parse_date_iso8601R/   t
   startswitht   errort   versiont   getR   R7   R6   R:   t   asctimet   listR<   RB   t   appendt   FeedParserDictt   feed_optionst   itemst   scrubRE   t   indext   NoneR   RF   t   reconstituteRG   RH   t   cache_directoryt   valuesR   t   calendart   timegmRI   R   t   statt   st_mtimet   toxmlR
   t   unlinkt   filterst   shellt   runR   t   existst   removeR   t   utimeRA   R   R    t   sortt   strftimet   makedirsR    t   parseStringt   xmlnst   sourcet   documentElement(   t   feed_urit	   feed_infot   datat   logt   sourcest   activity_horizonRH   t   feedtypeRM   t   namet   valueRE   t   idst   entryt   cachet
   cache_filet   mtimeR!   t   outputRL   t   feedidt   _[1]t   msg(    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyt
   writeCache@   sF   	&
& &6 !
' 
	) 
   
 $$&"'
!!$"%c         C   s  d d  k  } d d  k } d d k l } | i t i ƒ  ƒ } | i d t ƒ \ } }	 x¶| o®| i	 d | |  ƒ t
 d ƒ }
 t |
 d | ƒ t |
 d t i h  d d	 <ƒ ƒ yÑyc t | t ƒ o | i d
 ƒ } n | i d ƒ i d
 ƒ } | | j o | i	 d | | ƒ n Wn | i	 d | ƒ | } n Xh  } |	 i i d ƒ o |	 i d | d <n |	 i i d ƒ o |	 i d | d <n | i | d d | ƒ\ } } | i | p d ƒ i ƒ  | d <| i d j oS | i o d | _ q-|	 i i d ƒ o% |	 i d | d j o d | _ q-n t
 | ƒ }
 t |
 d | i d | ƒ ƒ | i d ƒ o | d =n t |
 d | ƒ WnN| j
 o | i d | |  ƒ n*| i j
 o" } | i d t | ƒ |  ƒ nú t i j
 o\ } | i i i ƒ  d j o! d |
 i  d	 <| i! d |  ƒ qÒ| i d t | ƒ |  ƒ n t" j
 oƒ } d d  k# } d d  k$ } | i% ƒ  \ } } } | i d | ƒ x? | i& | | ƒ | i' | ƒ D] } | i | i( ƒ  ƒ q°Wn X| i) d t d  | |	 |
 f ƒ | i d t ƒ \ } }	 qX Wd  S(!   Niÿÿÿÿ(   t   BadStatusLinet   blocks   Fetching %s via %dR   R-   R6   t   500R*   R   s   utf-8s   IRI %s mapped to %ss   unable to map %s to a URIR8   s   If-None-MatchR9   s   If-Modified-Sincet   GETs   -content-hashiÈ   i0  R;   s   content-locations   content-encodings&   Bad Status Line received for %s via %ds   HttpLib2Error: %s via %dR,   t   408s   Timeout in thread-%ds   HTTP Error: %s in thread-%ds   Error processing %st   item(*   t   httplib2R   t   httplibR–   t   HttpRP   t   http_cache_directoryRc   t   TrueR]   R   t   setattrR^   Rg   R   R   R
   R	   R[   RR   t   requestR   R   R*   t	   fromcacheRa   t   HttpLib2ErrorR   t   socketRU   RV   RW   R6   t   warnt	   Exceptiont   syst	   tracebackt   exc_infot   format_exception_onlyt	   format_tbt   rstript   put(   t   thread_indext   input_queuet   output_queueR†   Rœ   R   R–   t   hR'   R„   R[   R   R6   t   respt   contentt   eR¨   R©   RA   R‹   t   tbt   line(    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyt
   httpThread  st    
	(
 

	 c         C   s#  t  i } t a t i ƒ  } y' t i t | ƒ ƒ | i	 d | ƒ WnT y3 d d k
 } | i t | ƒ ƒ | i	 d | ƒ Wq™ | i d | ƒ q™ Xn Xd d k l } d d k l } | ƒ  } | ƒ  } h  } t i ƒ  }	 |	 o% t i i |	 ƒ o t i |	 ƒ n t t i ƒ  ƒ oZ xd t t t i ƒ  ƒ ƒ D]9 }
 | d t d |
 | | | f ƒ | |
 <| |
 i ƒ  q5Wn | i	 d	 ƒ xà t i ƒ  D]Ò } t i ƒ  } t | | ƒ } t i | ƒ } | i o |  o | i	 d
 | ƒ qn | i i  d d ƒ d j o | i	 d | ƒ qn | o' t" | ƒ o | i# d | | f ƒ q| i# d | | | f ƒ qWx$ | i$ ƒ  D] } | i# d d# ƒ qsWh  } x‰| i% ƒ  p | i% ƒ  p | ogx, | i% ƒ  d j o | o t& i' d ƒ qºWxæ| i% ƒ  oØ| i  t( ƒ \ } } } y*t) | d ƒ p t | i* i+ ƒ d j  ou h  } t) | d ƒ oI | i i  d d ƒ | d <y" t& i, | i i  d d ƒ ƒ } WqšqšXn t i | |  } na t i- h  d d <| i* d <g  d <h  d <| i. d <d d <t | i* i+ ƒ d <ƒ } | i i  d d ƒ } | p | i i  d d ƒ } n | } | i/ d ƒ o | i0 } n d } | o | | j o
 | } n | o | | j o
 | } n | oL d | | | i d <| i1 d | | | f ƒ | o | | i d  <qýn | o | | | <n | o | | | <n t2 | | | ƒ Wqét3 j
 oƒ } d d k4 } d d k5 } | i6 ƒ  \ } } } | i7 d! | ƒ x? | i8 | | ƒ | i9 | ƒ D] } | i7 | i: ƒ  ƒ q¨WqéXqéWxH | i$ ƒ  D]: a | t i; ƒ  p# | t =| p | i	 d" ƒ qqÜqÜWq–Wd S($   s!    Spider (fetch) an entire planet s    Socket timeout set to %d secondsiÿÿÿÿNs+   Timeout set to invalid value '%s', skipping(   t   Queue(   t   Threadt   targett   argss   Building work queues   Feed %s already in cacheR5   t   410s   Feed %s goneR›   i    gš™™™™™¹?R6   i,  R[   R8   R7   R9   Rb   R+   RC   RS   R*   RF   s   duplicate subscription: R/   s!   Duplicate subscription: %s and %sR.   s   Error processing %ss%   Finished threaded part of processing.(   NN(<   RN   RO   R    Rk   RP   t   feed_timeoutR¥   t   setdefaulttimeoutt   floatR]   t   timeoutsockett   setDefaultSocketTimeoutR\   R¹   t	   threadingRº   RŸ   R   R   Ry   R~   t   intt   spider_threadsR   R¸   t   startt   subscriptionsRQ   R   R^   t   parseR[   Rc   Rl   R)   R®   t   keyst   qsizeRX   t   sleept   Falset   hasattrR6   R*   t   strptimeRg   R-   RR   RC   R¦   R•   R§   R¨   R©   Rª   Ra   R«   R¬   R­   t   isAlive(   t   only_if_newR†   R,   RÁ   R¹   Rº   t   fetch_queuet   parse_queuet   threadst
   http_cacheR   R'   R‡   t   feed_sourceR„   t   threadt
   feeds_seenR[   t   optionsR:   R…   RF   RC   R1   Rµ   R¨   R©   RA   R‹   R¶   R·   (    (    s3   /home/sgala/public_html/code/venus/planet/spider.pyt   spiderPlanet`  sÄ    			   $* 

	 ! (   t   __doc__RX   Rp   t   reR   R&   t   xml.domR    RN   RP   R^   Rm   Rw   R¥   Rj   R   t   compileR   R   R   R   R    Rk   R   R   R)   R•   R¸   RÌ   RÙ   (    (    (    s3   /home/sgala/public_html/code/venus/planet/spider.pys   <module>   s   <T	!			×	I
