1   
  2   
  3  """Utilities for clue analysis. 
  4  """ 
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23  import copy 
 24   
 25  import Halberd.logger 
 26   
 27   
# Module-level logger used by the functions in this file.
logger = Halberd.logger.getLogger()
 29   
 30   
 31   
 32   
 33   
 34   
 35   
 36   
 37   
 38   
 39   
 41      """Study differences between fields. 
 42   
 43      @param clues: Clues to analyze. 
 44      @type clues: C{list} 
 45   
 46      @return: Fields which were found to be different among the analyzed clues. 
 47      @rtype: C{list} 
 48      """ 
 49      def pairs(num): 
 50          for i in xrange(num): 
 51              for j in xrange(num): 
 52                  if i == j: 
 53                      continue 
 54                  yield (i, j) 
  55   
 56      import difflib 
 57   
 58      different = [] 
 59      for i, j in pairs(len(clues)): 
 60          one, other = clues[i].headers, clues[j].headers 
 61          matcher = difflib.SequenceMatcher(None, one, other) 
 62   
 63          for tag, alo, ahi, blo, bhi in matcher.get_opcodes(): 
 64              if tag == 'equal': 
 65                  continue 
 66                   
 67              for name, value in one[alo:ahi] + other[blo:bhi]: 
 68                  different.append(name) 
 69   
 70      different.sort() 
 71      different.reverse() 
 72   
 73      return different 
 74   
 76      """Tries to detect and ignore MIME fields with ever changing content. 
 77   
 78      Some servers might include fields varying with time, randomly, etc. Those 
 79      fields are likely to alter the clue's digest and interfer with L{analyze}, 
 80      producing many false positives and making the scan useless. This function 
 81      detects those fields and recalculates each clue's digest so they can be 
 82      safely analyzed again. 
 83   
 84      @param clues: Sequence of clues. 
 85      @type clues: C{list} or C{tuple} 
 86      """ 
 87      from Halberd.clues.Clue import Clue 
 88   
 89      different = diff_fields(clues) 
 90   
 91       
 92      ignored = [] 
 93      for field in different: 
 94          method = '_get_' + Clue.normalize(field) 
 95          if not hasattr(Clue, method): 
 96              logger.debug('ignoring %s', field) 
 97              ignored.append(method) 
 98              setattr(Clue, method, lambda s, f: None) 
 99   
100      for clue in clues: 
101          Clue.parse(clue, clue.headers) 
102   
103      for method in ignored: 
104           
105           
106           
107          delattr(Clue, method) 
108   
109      return clues 
 110   
111   
def get_digest(clue):
    """Returns the specified clue's digest.

    This function is usually passed as a parameter for L{classify} so it can
    separate clues according to their digest (among other fields).

    @param clue: Object whose C{info} mapping holds a C{'digest'} entry.

    @return: The value stored under C{'digest'} in the clue's C{info} mapping.
    @rtype: C{str}

    @raise KeyError: If the clue's C{info} has no C{'digest'} entry.
    """
    info = clue.info
    return info['digest']
 122   
def clusters(clues, step=3):
    """Finds clusters of clues.

    The clues are sorted by time difference (see L{sort_clues}) and consumed
    from the front. At each position the biggest group of C{n} clues
    (C{step >= n >= 1}) whose first and last time differences are at most
    C{n} apart is emitted as one cluster. A lone clue always qualifies, so
    every clue ends up in exactly one cluster.

    @param clues: A sequence of clues to analyze
    @type clues: C{list} or C{tuple}

    @param step: Maximum number of clues in a cluster. Must be at least 1.
    @type step: C{int}

    @return: A generator yielding each cluster as a tuple of clues.
    @rtype: generator of C{tuple}

    @raise ValueError: If C{step} is lower than 1 (raised when the generator
    is first advanced).
    """
    # NOTE(review): callers in this module omit ``step``, so a default must
    # exist; the value 3 is assumed -- confirm.
    if step < 1:
        # No group size would ever be tried, so no clue would ever be
        # consumed and the loop below could not terminate.
        raise ValueError('step must be >= 1, got %r' % (step,))

    def is_cluster(group):
        """Tell whether a sorted group is tight enough to form a cluster."""
        return abs(group[0].diff - group[-1].diff) <= len(group)

    remaining = sort_clues(clues)
    while remaining:
        # Try the biggest group first and shrink it until one fits; a group
        # of one clue always does.
        for size in range(step, 0, -1):
            group = remaining[:size]
            if len(group) == size and is_cluster(group):
                yield tuple(group)
                remaining = remaining[size:]
                break
170   
def merge(clues):
    """Merges a sequence of clues into one.

    A new clue will store the total count of the clues. It is a shallow copy
    (C{copy.copy}) of the first clue, so the passed clues are left untouched.

    Note that each L{Clue} has a starting count of 1

    >>> a, b, c = Clue(), Clue(), Clue()
    >>> sum([x.getCount() for x in [a, b, c]])
    3
    >>> a.incCount(5), b.incCount(11), c.incCount(23)
    (None, None, None)
    >>> merged = merge((a, b, c))
    >>> merged.getCount()
    42
    >>> merged == a
    True

    @param clues: A sequence containing all the clues to merge into one.
    @type clues: C{list} or C{tuple}

    @return: The result of merging all the passed clues into one.
    @rtype: L{Clue}

    @raise IndexError: If C{clues} is empty.
    """
    first, others = clues[0], clues[1:]

    merged = copy.copy(first)
    for clue in others:
        merged.incCount(clue.getCount())
    return merged
 199   
def classify(seq, *classifiers):
    """Classify a sequence according to one or several criteria.

    We store each item into a nested dictionary using the classifiers as key
    generators (all of them must be callable objects).

    In the following example we classify a list of clues according to their
    digest and their time difference.

    >>> a, b, c = Clue(), Clue(), Clue()
    >>> a.diff, b.diff, c.diff = 1, 2, 2
    >>> a.info['digest'] = 'x'
    >>> b.info['digest'] = c.info['digest'] = 'y'
    >>> get_diff = lambda x: x.diff
    >>> classified = classify([a, b, c], get_digest, get_diff)
    >>> digests = classified.keys()
    >>> digests.sort()  # We sort these so doctest won't fail.
    >>> for digest in digests:
    ...     print digest
    ...     for diff in classified[digest].keys():
    ...         print ' ', diff
    ...         for clue in classified[digest][diff]:
    ...             if clue is a: print '    a'
    ...             elif clue is b: print '    b'
    ...             elif clue is c: print '    c'
    ...
    x
      1
        a
    y
      2
        b
        c

    @param seq: A sequence to classify.
    @type seq: C{list} or C{tuple}

    @param classifiers: One or more callables, passed as separate positional
    arguments, which return specific fields of the items contained in L{seq}.
    @type classifiers: callables

    @return: A nested dictionary in which the keys are the fields obtained by
    applying the classifiers to the items in the specified sequence. The
    innermost values are lists holding the items themselves.
    @rtype: C{dict}

    @raise TypeError: If no classifier is given.
    """
    if not classifiers:
        raise TypeError('classify() requires at least one classifier')

    # Validate every classifier once, including the last one.
    for classifier in classifiers:
        assert callable(classifier)

    outer, last = classifiers[:-1], classifiers[-1]

    classified = {}
    for item in seq:
        # Walk (creating as needed) one dictionary level per classifier...
        section = classified
        for classifier in outer:
            section = section.setdefault(classifier(item), {})

        # ...and keep the item itself in a list at the innermost level.
        section.setdefault(last(item), []).append(item)

    return classified
 259   
def sections(classified, sects=None):
    """Returns sections (and their items) from a nested dict.

    Dictionaries are walked recursively and every list found is collected;
    values which are neither a C{dict} nor a C{list} are ignored.

    See also: L{classify}

    @param classified: Nested dictionary.
    @type classified: C{dict}

    @param sects: List of results. It should not be specified by the user.
    @type sects: C{list}

    @return: A list of lists where each item is one of the lists stored in
    the nested dictionary.
    @rtype: C{list}
    """
    if sects is None:
        sects = []

    if isinstance(classified, dict):
        for value in classified.values():
            sections(value, sects)
    elif isinstance(classified, list):
        sects.append(classified)

    return sects
 284   
def deltas(xs):
    """Computes the differences between the elements of a sequence of integers.

    >>> deltas([-1, 0, 1])
    [1, 1]
    >>> deltas([1, 1, 2, 3, 5, 8, 13])
    [0, 1, 1, 2, 3, 5]

    @param xs: A sequence of integers.
    @type xs: C{list}

    @return: A list of differences between consecutive elements of L{xs}. It
    is empty when L{xs} holds fewer than two elements.
    @rtype: C{list}
    """
    # Pairing the sequence with itself shifted by one avoids recursion, which
    # would hit the interpreter's recursion limit on long inputs.
    return [after - before for before, after in zip(xs, xs[1:])]
 303   
def slices(start, xs):
    """Returns slices of a given sequence separated by the specified indices.

    If we wanted to get the slices necessary to split range(20) in
    sub-sequences of 5 items each we'd do:

    >>> seq = range(20)
    >>> indices = [5, 10, 15]
    >>> for piece in slices(0, indices):
    ...     print seq[piece]
    [0, 1, 2, 3, 4]
    [5, 6, 7, 8, 9]
    [10, 11, 12, 13, 14]
    [15, 16, 17, 18, 19]

    @param start: Index of the first element of the sequence we want to
    partition.
    @type start: C{int}.

    @param xs: Sequence of indexes where 'cuts' must be made.
    @type xs: C{list}

    @return: A sequence of C{slice} objects suitable for splitting a list as
    specified. The last slice is always open ended.
    @rtype: C{list} of C{slice}
    """
    pieces = []
    for cut in xs:
        pieces.append(slice(start, cut))
        start = cut

    # Whatever remains after the last cut goes into an open ended slice.
    pieces.append(slice(start, None))
    return pieces
 334   
def sort_clues(clues):
    """Sorts clues according to their time difference.

    The sort is stable: clues sharing the same C{diff} keep their relative
    order and are never compared to each other directly.

    @param clues: Clues to sort; each one must expose a C{diff} attribute.
    @type clues: C{list} or C{tuple}

    @return: A new list with the clues in ascending C{diff} order.
    @rtype: C{list}
    """
    return sorted(clues, key=lambda clue: clue.diff)
 343   
344   
def filter_proxies(clues, maxdelta=3):
    """Detect and merge clues pointing to a proxy cache on the remote end.

    Clues are grouped by their C{_remote} attribute and their digest. Inside
    each group the clues are sorted by time difference and split into runs
    wherever two consecutive time differences are more than C{maxdelta}
    apart; every run is then merged into a single clue (see L{merge}).

    @param clues: Sequence of clues to analyze
    @type clues: C{list}

    @param maxdelta: Maximum difference allowed between a clue's time
    difference and the previous one.
    @type maxdelta: C{int}

    @return: Sequence where all irrelevant clues pointing out to proxy caches
    have been filtered out.
    @rtype: C{list}
    """
    # NOTE(review): callers in this module omit ``maxdelta``, so a default
    # must exist; the value 3 is assumed -- confirm.
    results = []

    # NOTE(review): ``_remote`` is presumably the time reported by the remote
    # host -- confirm against Clue.
    get_rtime = lambda c: c._remote
    classified = classify(clues, get_rtime, get_digest)

    for group in sections(classified):
        if len(group) == 1:
            results.append(group[0])
            continue

        group = sort_clues(group)
        diffs = [c.diff for c in group]

        # deltas(diffs)[idx] is the gap between clue idx and clue idx + 1, so
        # a gap wider than maxdelta means a new run begins at idx + 1.
        cuts = [idx + 1 for idx, delta in enumerate(deltas(diffs))
                if abs(delta) > maxdelta]

        for piece in slices(0, cuts):
            run = group[piece]
            # Defensive: skip an empty run instead of dropping the remainder
            # of the group.
            if run:
                results.append(merge(run))

    return results
 386   
def uniq(clues):
    """Return a list of unique clues.

    This is needed when merging clues coming from different sources. Clues with
    the same time diff and digest are not discarded, they are merged into one
    clue with the aggregated number of hits.

    @param clues: A sequence containing the clues to analyze.
    @type clues: C{list}

    @return: Filtered sequence of clues where no clue has the same digest and
    time difference.
    @rtype: C{list}
    """
    get_diff = lambda c: c.diff

    # Each innermost section holds the clues sharing both digest and diff.
    classified = classify(clues, get_digest, get_diff)
    return [merge(section) for section in sections(classified)]
 410   
def hits(clues):
    """Compute the total number of hits in a sequence of clues.

    @param clues: Sequence of clues.
    @type clues: C{list}

    @return: Total hits, i.e. the sum of C{getCount()} over all the clues
    (0 for an empty sequence).
    @rtype: C{int}
    """
    # NOTE(review): this function's name is not referenced anywhere in the
    # reviewed portion of the module; ``hits`` is assumed -- confirm against
    # callers.
    return sum(clue.getCount() for clue in clues)
 421   
def analyze(clues):
    """Draw conclusions from the clues obtained during the scanning phase.

    Duplicated clues are merged first (L{uniq}), then clues pointing to proxy
    caches are filtered (L{filter_proxies}). The remaining clues are grouped
    by digest and every cluster found in a group (L{clusters}) is merged
    into a single clue.

    @param clues: Unprocessed clues obtained during the scanning stage.
    @type clues: C{list}

    @return: Coherent list of clues identifying real web servers.
    @rtype: C{list}
    """
    clues = filter_proxies(uniq(clues))

    results = []
    for same_digest in classify(clues, get_digest).values():
        for cluster in clusters(same_digest):
            results.append(merge(cluster))

    return results
 444   
445   
def reanalyze(clues, analyzed, threshold):
    """Identify and ignore changing header fields.

    After initial analysis one must check that there aren't as many realservers
    as obtained clues. If there were it could be a sign of something wrong
    happening: each clue is different from the others due to one or more MIME
    header fields which change unexpectedly.

    @param clues: Raw sequence of clues. Must not be empty.
    @type clues: C{list}

    @param analyzed: Result from the first analysis phase.
    @type analyzed: C{list}

    @param threshold: Minimum clue-to-realserver ratio in order to trigger
    field inspection.
    @type threshold: C{float}

    @return: C{analyzed} itself when the ratio is below the threshold,
    otherwise the result of analyzing the clues again once their changing
    fields have been ignored.
    @rtype: C{list}
    """
    # NOTE(review): this function's name is not referenced anywhere in the
    # reviewed portion of the module; ``reanalyze`` is assumed -- confirm
    # against callers.
    assert len(clues) > 0

    def ratio(servers):
        """Number of servers found per collected clue."""
        return len(servers) / float(len(clues))

    r = ratio(analyzed)
    if r >= threshold:
        logger.debug('clue-to-realserver ratio is high (%.3f)', r)
        logger.debug('reanalyzing clues...')

        ignore_changing_fields(clues)
        analyzed = analyze(clues)

        logger.debug('clue reanalysis done.')

    # The ratio is measured again on the (possibly new) results; the warning
    # is only issued when more than 10 clues were collected.
    if ratio(analyzed) >= threshold and len(clues) > 10:
        logger.warn(
'''The following results might be incorrect.  It could be because the remote
host keeps changing its server version string or because halberd didn't have
enough samples.''')

    return analyzed
488   
489   
504   
# Script entry point. NOTE(review): _test() is not defined in the portion of
# the module reviewed here -- presumably it runs the doctests embedded in the
# docstrings above; confirm.
if __name__ == '__main__':
    _test()
507   
508   
509   
510