merge florian: more unicode testing
[forgeplucker:forgeplucker.git] / trunk / forgeplucker / handle_fusionforge.py
1 # -*- coding: utf-8 -*-
2 """
3 Handler class for FusionForge
4
5 Contributed in the frame of the COCLICO project.
6
7 (C) 2010 Florian Dudouet - INRIA
8
9 """
10
11 #import psycopg2 #@UnusedImport
12 import re
13 import copy
14 #import urllib2
15 import os
16 import sys
17
18 from htmlscrape import *
19 from generic import *
20 from forgeplucker.FusionForge_DocMan import FusionForge_DocMan
21
22 class FusionForge(GenericForge):
23         """
24 The FusionForge handler provides machinery for the FusionForge sites.
25
26 """
27         class Tracker(GenericForge.GenericTracker):
28                 
29                 def __init__(self, label, parent, projectbase):
30                         GenericForge.GenericTracker.__init__(self, parent, label)
31                         self.parent = parent
32                         self.optional = False
33                         self.chunksize = 50
34                         self.zerostring = None
35                         self.projectbase = projectbase
36                         self.atid = re.search('atid=([0-9]*)', projectbase).group(1)
37                         self.name_mappings = {
38                                 "bug_group_id": "group",
39                                 "category_id": "category",
40                                 "resolution_id": "resolution",
41                                 "status_id": "status",                          
42                         }
43                         # l'auteur du bug n'est pas éditable et ne peut donc être récupérée par le crawler dans le formulaire, cette regexp ne sert qu'à lui
44                         self.submitter_re = '''<tt><a href="\S*">([^<]*)</a></tt>'''
45                         # la date n'est pas éditable et ne peut donc être récupérée par le crawler dans le formulaire
46                         self.date_re = '''<td><strong>Date Submitted:</strong><br />\s*([^<]*)\s*<'''
47                         # vérifier les champs de formulaire à ignorer
48                         self.ignore = ("canned_response",
49                         "new_artifact_type_id",
50                         "words", "type_of_search")
51                         # identifie les ids d'artefacts (bug, patches...) : aid
52                         self.artifactid_re = r'/tracker/index.php\?func=detail&amp;aid=([0-9]+)&amp;group_id=[0-9]+&amp;atid=[0-9]+"'
53                         # # chaque tracker a également un atid propre, indépendant du projet et global (utilité?????). ex, bugs de projet A : id 1, patch de projet 1 : id 2, bug de projet 2 : id 3...
54                         
55                         # m = re.search('<a href="[^/]*//[^/]*/([^"]*)">.*%s</a>' % label, self.parent.basepage)
56                         
57                         # if m:
58                         #       print m.groups()
59                         #       self.projectbase = dehtmlize(m.group(1))
60                         # else:
61                         #       raise ForgePluckerException("Le tracker '%s' n'a pas été trouvé" \
62                         #                   % label)
63                         # m = re.search('<a href="[^"]*atid=([0-9]*)[^"]*">.*%s</a>' % label, self.parent.basepage)
64                         # print m.groups()
65                         # self.atid = m.group(1)
66
67                         #Update view mode to parse closed tickets by default
68                         self.trackerSwitchViewMode()
69
70                 def getUrl(self):
71                         return self.projectbase
72
73                 def trackerSwitchViewMode(self):
74                         params = {'set':'custom','assigned_to':0,'status':100,'query_id':-1,'sort_col':'priority','_sort_ord':'DESC','submit':'Quick+Browse'}
75                         self.parent.fetch('tracker/index.php?group_id='+self.parent.project_id+'&atid='+self.atid, 'Updating '+self.atid + ' tracker\'s view mode', params)
76                         return True
77
78                 def access_denied(self, page, issue_id=None):
79                         '''
80                         Check if the user has edit access to the current tracker
81                         '''
82                         if "No items found" in page:
83                                 return 0
84                         else:
85                                 return issue_id is None and not "Mass Update" in page
86
87                 def has_next_page(self, page):
88                         """
89                         Check if the current page contains a count indicating that another page of artifacts exists
90                         """
91                         return "Next &raquo;" in page
92
93                 def chunkfetcher(self, offset):
94                         "Get a bugtracker index page - all bug IDs, open and closed.."
95                         return self.projectbase + "&offset=%d&limit=100" % offset
96
97                 def detailfetcher(self, issueid):
98                         "Generate a bug detail URL for the specified bug ID."
99                         return self.projectbase + '&func=detail&aid=' + str(issueid)
100
101                 def narrow(self, text):
102                         "Get the section of text containing editable elements."
103                         return self.parent.narrow(text)
104
105                 def parse_followups(self, contents, bug):
106                         "Parse followups out of a displayed page in a bug or patch tracker."
107         
108                         comments = []
109                         soup = BeautifulSoup(contents)
110                         soup = soup.find('div', title='Followups')
111                         # 5.0 has this :
112                         t = soup.find('table', attrs={'class': re.compile('.*listTable')})
113                         if not t :
114                                 # then 4.8 which has only this :
115                                 t = soup.find('table').find('table')
116                         if t:
117                                 for tr in t.findAll("tr"):
118                                         td=tr.find("td")
119                                         if td:
120                                                 # if 4.8, then, has this :
121                                                 pre = td.find('pre')
122                                                 if pre:
123                                                         td = pre
124                                                 comment = {"class":"COMMENT"}
125                                                 m = re.search('Date: ([-0-9: ]+)', str(td.contents))
126                                                 if m:
127                                                         comment['date'] = self.parent.canonicalize_date(m.group(1))
128                                                         m = re.search('Sender: .*/users/([^/]*)/"', str(td.contents))
129                                                         if m:
130                                                                 comment['submitter'] = m.group(1)
131                                                         comment['comment'] = td.contents[-1].strip()
132                                 
133                                                         comments.append(comment)
134                         if comments :
135                                 comments.reverse()
136                         return comments
137
138                 def parse_history_table(self, contents, artifacts):
139                         "Get the change history attached to a tracker artifact."
140                         changes, filechanges, attachments, filelist = [], [], [], []
141                         
142
143                         for (field, old, date, by) in self.parent.table_iter(contents,
144                                         r'<h3>Change Log:</h3>',
145                                         4,
146                                         "history",
147                                         has_header=True):
148                                 field, old, date, by = field.strip(), old.strip(), date.strip(), by.strip()
149                                 if field in ('File Added', 'File Deleted'):
150                                         filechanges.append((field, old, date, by))
151                                 #if field not in ('close_date'):
152                                 change = {'field':field, 'old':old, 'date':self.parent.canonicalize_date(date), 'by':by, 'class':'FIELDCHANGE'}
153                                 changes.append(change)
154                         for (action, _file, date, by) in reversed(filechanges):
155                                 fileid, filename = _file.split(':')
156                                 filename = filename.strip()
157                                 if action == 'File Added':
158                                         attachment = {"class":"ATTACHMENT", "filename": filename, "by": by, "date": self.parent.canonicalize_date(date), "id": fileid}
159                                         filelist.append(filename)
160                                         attachments.append(attachment)
161                                 elif action == 'File Deleted':
162                                         for attachment in attachments:
163                                                 if attachment['id'] == fileid:
164                                                         attachment['deleted'] = self.parent.canonicalize_date(date)
165                         for attachment in attachments:
166                                 try:
167                                         filename = attachment['filename']
168                                         m = re.search('<a href="[^"]*/(tracker/[^"]*/%s)">Download</a></td>' % filename, contents)
169                                         url = m.group(1)
170                                         dl = self.parent.fetch(url, "download")
171                                         rep_dest = self.parent.project_name + '/' + self.type
172                                         if not os.path.exists(self.parent.project_name):
173                                                 os.mkdir(self.parent.project_name)
174                                         if not os.path.exists(rep_dest):
175                                                 os.mkdir(rep_dest)
176                                         fnout = rep_dest + '/' + filename
177                                         fout = open(fnout, "wb")
178                                         fout.write(dl)
179                                         fout.close
180                                         attachment['uri'] = self.parent.real_url(url)
181                                         attachment['url'] = fnout
182                                 except Exception, e:
183                                         continue
184
185                         return changes, attachments
186                 
187                 def update_extrafields(self, artifact, contents, vocabularies):
188                         "Find names of extra fields by parsing the html contents"
189                         tempKeys, tempValues, tempVocab = [], {}, {}
190                         for arti in artifact:
191                                 if (arti[:13] == "extra_fields["):
192                                         try:
193                                                 name = re.search('.*<strong>([a-zA-Z0-9_-|\s]*)[:|<br />|</strong>].*%s' % re.escape(arti), 
194                                                                  contents, re.DOTALL).group(1)
195                                         except:
196                                                 print "extra field parsing error"
197                                                 continue
198
199                                         tempValues[name] = artifact[arti]
200                                         tempKeys.append(arti)
201                                         try:
202                                                 tempVocab[name] = vocabularies[arti]
203
204                                         except:
205                                                 continue
206                                 else:
207                                         tempValues[arti] = artifact[arti]
208                         for key in tempKeys:
209                                 del artifact[key]
210                                 try:
211                                         del vocabularies[key]
212                                 except:
213                                         continue
214                         for value in tempValues:
215                                 artifact[value] = tempValues[value]
216                         for vocab in tempVocab:
217                                         vocabularies[vocab] = tempVocab[vocab]
218                         return artifact, vocabularies
219
220                 def custom(self, contents, artifact, vocabularies):
221                         "parse specific fields of FusionForge"
222                         #des champs spécifiques à fusionforge qu'il serait intéressant d'ajouter, peut ne rien faire en dehors d'appeler parse_followups et parse_attachments. méthode appelée par generic
223                         artifact, vocabularies = self.update_extrafields(artifact, contents, vocabularies)
224
225                         #get Detailed Description (uneditable thus unfetchable directly from form)
226                         dD = None
227                         dDFound = False
228                         bs = BeautifulSoup(self.narrow(contents))
229                         # for 4.8
230                         for t in bs.findAll('table'):
231                                 try :
232                                         if t.find('thead').find('tr').find('td').contents[0] == 'Detailed description':
233                                                 dDFound = True
234                                                 dD = t.find('tr', attrs={'class': 'altRowStyleEven'}).find('td').find('pre').contents[0]
235                                                 dD = blocktext(dehtmlize(dD.strip()))
236                                                 break
237                                 except AttributeError, e : # 
238                                         if str(e) != "'NoneType' object has no attribute 'find'":
239                                                 raise
240                         # Try for 5.0
241                         if not dDFound:
242                                 bs = BeautifulSoup(self.narrow(contents))
243                                 for t in bs.findAll('table', attrs={'class': re.compile('.*listTable')}):
244                                         try :
245                                                 if t.find('th').find('div').find('div').contents[0] == 'Detailed description':
246                                                         dDFound = True
247                                                         dD = t.find('tr').findNext('tr').find('td').contents[0]
248                                                         dD = blocktext(dehtmlize(dD.strip()))
249                                                         break
250                                         except AttributeError, e : # 
251                                                 if str(e) != "'NoneType' object has no attribute 'find'":
252                                                         raise
253                         if dD:
254                                 artifact['description'] = dD
255                         
256                         m = re.search('''Date Closed:</strong><br />\s*([^<]*)\s*<''', contents)
257                         if not (m == None):
258                                 artifact['closed_at'] = self.parent.canonicalize_date(m.group(1).strip())
259                         artifact['comments'] = self.parse_followups(contents, artifact)
260                         artifact['history'], artifact['attachments'] = self.parse_history_table(contents, artifact)
261                         
262
263         class BugTracker(Tracker):
264                 def __init__(self, parent, projectbase):
265                         FusionForge.Tracker.__init__(self, "Bugs", parent, projectbase)
266                         self.type = "bugs"
267         class FeatureTracker(Tracker):
268                 def __init__(self, parent, projectbase):
269                         FusionForge.Tracker.__init__(self, "Feature Requests", parent, projectbase)
270                         self.type = "features"
271         class PatchTracker(Tracker):
272                 def __init__(self, parent, projectbase):
273                         FusionForge.Tracker.__init__(self, "Patches", parent, projectbase)
274                         self.type = "patches"
275         class SupportTracker(Tracker):
276                 def __init__(self, parent, projectbase):
277                         FusionForge.Tracker.__init__(self, "Support", parent, projectbase)
278                         self.type = "support"
279         class CustomTracker(Tracker):
280                 '''
281                 This class is used to create a tracker object for any custom tracker
282                 '''
283                 def __init__(self, parent, nameTracker, projectbase):
284                         FusionForge.Tracker.__init__(self, nameTracker, parent, projectbase)
285                         self.type = 'custom'
286
287         def get_trackers(self):
288                 '''
289                 Get the list of trackers from the trackers page.
290                 Contrary to what ForgePlucker does normally it does not use the summary page to extract the links as this page only contains the default trackers and not the custom ones.
291                 '''
292                 trackers = []
293                 trackersPage = self.fetch('tracker/?group_id='+self.project_id, 'Fetching tracker list')
294                 trackersPage = BeautifulSoup(trackersPage)
295                 for table in trackersPage.findAll('table') :
296                         trs = table.findAll('tr')[1:]
297                         for tr in trs:
298                                 a = tr.find('a')
299                                 tPage = re.search('[^/]*//[^/]*.*/([^/]+/[^"/]*)', a['href']).group(1)
300                                 tLabel = dehtmlize(a.contents[1]).strip()
301                                 trackers.append({'label':tLabel, 'projectbase':tPage})
302
303                 self.trackers = []      
304
305                 defaults = {'Bugs': FusionForge.BugTracker, 
306                             'Feature Requests': FusionForge.FeatureTracker, 
307                             'Patches': FusionForge.PatchTracker, 
308                             'Support': FusionForge.SupportTracker}
309                 for tracker in trackers:
310                         if self.verbosity >= 1:
311                                 self.notify("found tracker: " + tracker['label'] + ':' + tracker['projectbase'])
312                         if tracker['label'] in defaults:
313                                 self.trackers.append(defaults[tracker['label']](self, tracker['projectbase']))
314                         else:
315                                 self.trackers.append(FusionForge.CustomTracker(self, tracker['label'], tracker['projectbase']))
316                 return self.trackers
317
318         ### PERMISSIONS/ROLES PARSING
319
320         def user_page(self, username):
321                 return 'users/' + username
322                 
323         def pluck_permissions(self):
324                 '''
325                 Get the permissions associated with each role in the project and return the corresponding array
326                 '''
327                 if self.verbosity >= 1:
328                         self.notify('plucking permissions from project/memberlist.php?group_id='+self.project_id)
329                 contents = self.fetch('project/memberlist.php?group_id=' + self.project_id, 'Roles page')
330                 perms = {}
331                 for (realname, username, role, skills) in self.table_iter(self.narrow(contents), '<table', 4, 'Roles Table', has_header=True):
332                         username = username.strip()
333                         perms[username] = {'role':role}
334                         perms[username]['real_name'] = realname
335                         perms[username]['URL'] = self.real_url(self.user_page(username))
336                         
337                 for user in perms:
338                         contents = self.narrow(self.fetch(self.user_page(user), 'User page'))
339                         mail = re.search('''sendmessage.php\?touser=[0-9]*">([^<]*)</a>''', contents, re.DOTALL).group(1).strip().replace(" @nospam@ ","@")
340                         perms[user]['mail'] = mail
341                         
342                 return perms
343         
344         def pluck_roles(self):
345                 '''
346                 Get the roles of each registered user of the project and returns the corresponding array
347                 '''
348                 roles = {}
349                 contents = self.fetch('project/admin/?group_id=' + self.project_id, 'Admin page')
350                 m = re.search('''<form action="roleedit.php\?group_id=[0-9]*" method.*<select name="role_id">(.*)</select>''', contents, re.DOTALL)
351
352                 # above is for 4.8, then if fails, for 5.0 :
353                 if not m:
354                         contents = self.fetch('project/admin/users.php?group_id=' + self.project_id, 'Admin page')
355                         m = re.search('''<form action="roleedit.php\?group_id=[0-9]*[^"]*" method.*<select name="role_id">(.*)</select>''', contents, re.DOTALL)
356
357                 n = re.findall('''<option value="([0-9]*)"[^>]*>(.*)</option>''', m.group(1))
358                 n.append(['1', 'Default'])#Default role for project creator, always #1
359                 n.append(['observer', 'Observer'])
360                 for i in range(0, len(n)):
361                         permissions = {}
362                         editpagecontents = self.fetch('project/admin/roleedit.php?group_id=' + self.project_id + '&role_id=' + n[i][0], 'Edit Role ' + n[i][1] + ' page')
363                         for (section, subsection, setting) in self.table_iter(editpagecontents, '<table>', 3, 'Edit Table', has_header=True, keep_html=True):
364                                 subsection = dehtmlize(str(subsection)).strip()
365                                 section = dehtmlize(str(section)).strip()
366                                 t = re.findall('''<option value="[-\w]*" selected="selected">([^<]*)</option>''', str(setting))
367                                 if subsection != '-':
368                                         if len(t)>1:
369                                                 permissions[section + ':' + 'AnonPost' + ':' + subsection]=t[1]
370                                         permissions[section + ':' + subsection] = t[0]
371                                 elif len(t) != 1: #Exception for project Admin.
372                                         permissions[section] = t[-1]
373                                 else:
374                                         permissions[section] = t[0]
375                         roles[n[i][1]] = permissions
376                 return roles
377         
378         ### WIKI PARSING : COCLICO NEW FEATURE
379         
380         
381         def pluck_wiki(self, state=False):
382                 '''
383                 Get the phpWiki associated with the project using its specialized export function
384                 @param state: if true, export last state of the wiki, else exports the last state and the pages history
385                 '''
386                 #TODO : Check admin state
387                 #Will pluck PhpWiki dump
388                 #if state = true : last state only, if state = false : last state + pages history
389                 #DL in same folder/phpwiki
390                 dl = self.fetch('plugins/wiki/index.php?zip=all&type=g&id=' + self.project_id, 'Wiki dump')
391                 rep_dest = self.project_name + '/PhpWiki'
392                 if not os.path.exists(self.project_name):
393                         os.mkdir(self.project_name)
394                 if not os.path.exists(rep_dest):
395                         os.mkdir(rep_dest)
396                 fnout = rep_dest + '/FullDump.zip'
397                 fout = open(fnout, "wb")
398                 fout.write(dl)
399                 fout.close
400                 
401         ### DOCMAN PARSING : COCLICO NEW FEATURE
402         
403         def pluck_docman(self):
404                 '''
405                 Get the Document Manager's data of the project and returns the corresponding array, plus downloads any attached files in the local directory $(PROJECT_NAME)/docman
406                 '''
407                 # First page of a docman admin web page.
408                 init_page = self.fetch('docman/admin/?group_id='+ self.project_id, 'main docman page')
409                 result = {}
410                 #get each category (active/deleted/hidden/private)
411                 m=re.findall('''<li><strong>(.*)</strong>(.*)''',init_page)
412                 #for each category
413                 for lis in m:
414                 #execute the recursive function at the root of this category
415                         result[lis[0]] = self.pluck_docman_list(init_page, docman_type = lis[0])
416                 return result
417
418         def pluck_docman_list(self, contents, url = None, docman_type = None):
419                 '''
420                 Get the documents and directories at the specified page and return a corresponding array
421                 @param contents: The considered HTML to be parsed.
422                 @param url: The URL where this HTML can be retrieved
423                 @param docman_type: 
424                 '''             
425                 #TODO:Check usefulness of docman_type 
426                 result = {}
427                 #Init a FusionForge_DocMan instance, used to parse contents.
428                 docman = FusionForge_DocMan(contents, url = url, docman_type = docman_type)
429                 #Get a list of the content at the current level for the current type.
430                 d = docman.get_docman_list()
431                 #for each content at this level
432                 for el in d:
433                 #if not a list, then it's a directory
434                         if not (type(d[el]) is list):
435                         #recursive call to myself to get the content for this directory
436                                 result[el.encode('utf-8')] = self.pluck_docman_list(self.fetch('docman/admin/' + d[el], 'docman_explorer'), url = d[el], docman_type = docman_type)
437                         else:
438                 #else, it's a file
439                                 finfo = FusionForge_DocMan(self.fetch('docman/admin/' + d[el][0], 'docfile'))
440                         #get its information
441                                 el = el.encode('utf-8')
442                                 result[el] = finfo.get_file_info()
443                                 if result[el]:
444                         #create a directory and file to download the docfile
445                                         rep_dest = self.project_name + '/docman'
446                                         if not os.path.exists(self.project_name):
447                                                 os.mkdir(self.project_name)
448                                         if not os.path.exists(rep_dest):
449                                                 os.mkdir(rep_dest)
450                                         dl = self.fetch('docman/' + result[el]['url'], 'docman file fetching')
451                                         fnout = rep_dest + '/' + result[el]['file_name']
452                                         fout = open(fnout, "wb")
453                                         fout.write(dl)
454                                         fout.close()
455                                 #update the url to a local url
456                                         result[el]['url'] = fnout
457                                         for k,v in result[el].items():
458                                                 k = k.encode('utf-8')
459                                                 v = v.encode('utf-8')
460                                                 result[el][k]=v
461                                 else:
462                                 #file was an url:delete
463                                         del result[el]
464                 return result
465
466         ### FRS PARSING : COCLICO NEW FEATURE
467
468         def pluck_frs(self):
469                 '''
470                 Get the contents of the File Release System of the project and return a corresponding data array.
471                 This is the function which should be called from the extractor.
472                 '''
473                 temp = self.fetch('forum/forum.php?forum_id=1&group_id=6', 'frs file fetching')
474                 init_page = self.fetch('frs/admin/?group_id=' + self.project_id, 'main FRS Admin page')
475                 result = {}
476                 
477                 soup = BeautifulSoup(init_page)
478                 trs = soup.find('table').findAllNext('tr')[1:]
479                 
480                 for tr in trs:
481                         pk_name = tr.find('input', attrs={'name':'package_name'})['value']
482                         pk_stid = tr.find('option', attrs={'selected':'selected'})['value']
483                         pk_stname = tr.find('option', attrs={'selected':'selected'}).contents[0]
484                         pk_releases = tr.find('a',attrs={'href':re.compile('showreleases')})['href']
485                         pk_id = re.search('package_id=([0-9]*)', pk_releases).group(1)
486                         result[pk_name] = {}
487                         result[pk_name]['status']=pk_stname
488                         
489                         if pk_stid=='3':
490                                 #Change status to active
491                                 self.switch_pkg_status(pk_name, pk_id, 1)
492                         result[pk_name]['releases']=self.pluck_frs_releases(pk_releases)
493                         if pk_stid=='3':
494                                 #Change status back to hidden
495                                 self.switch_pkg_status(pk_name, 3)
496                 return result
497         
498         def switch_pkg_status(self, pk_name, pk_id, pk_stid=1):
499                 '''
500                 switch status id of a package from active to hidden and vice/versa
501                 @param pk_name: Name of the package
502                 @param pk_id: ID of the package
503                 @param pk_stid: Status id to be set for this package
504                 '''
505                 params = {'group_id':self.project_id, 'func':'update_package', 'package_id':pk_id, 'package_name':pk_name, 'status_id':pk_stid, 'submit':'Update'}
506                 self.fetch('frs/admin/index.php', 'Updating '+str(pk_name) + ' status to ' + str(pk_stid), params)
507                 return True
508         
509         
510         def switch_rel_status(self, rel_edit, r_name, r_stid, r_date, r_notes, r_change):
511                 '''
512                 switch status id of a release from active to hidden and vice/versa
513                 @param pk_name: Name of the release
514                 @param pk_id: ID of the release
515                 @param pk_stid: Status id to be set for this release
516                 '''
517                 params = {'step1':1, 'release_date':r_date, 'release_name':r_name, 'status_id':r_stid, 'release_notes':r_notes, 'release_changes':r_change, 'preformatted':'on','submit':'Submit/Refresh'}
518                 self.fetch('frs/admin/'+str(rel_edit), 'Updating '+str(r_name) + ' status to ' + str(r_stid), params)
519                 return True
520         
521         def pluck_files_for_release(self, r_id, r_files):
522                 '''
523                 Get the files of a specified release
524                 @param r_id: the id of the release
525                 @param r_files: the array containing the current files, the new ones will be concatenated here
526                 '''
527                 init_page = self.fetch('frs/?group_id=' +self.project_id, 'Plucking files from main FRS pages...')
528                 soup = BeautifulSoup(init_page)
529                 r_tag = soup.find('a',attrs={'href':re.compile('release_id='+r_id)})
530                 for fname in r_files:
531                         furl = r_tag.findNext('a', {'href':re.compile(fname)})['href']
532                         
533                         rep_dest = self.project_name + '/frs'
534                         if not os.path.exists(self.project_name):
535                                 os.mkdir(self.project_name)
536                         if not os.path.exists(rep_dest):
537                                 os.mkdir(rep_dest)
538                         dl = self.fetch(furl, 'frs file fetching')
539                         fnout = rep_dest + '/' + fname
540                         fout = open(fnout, "wb")
541                         fout.write(dl)
542                         fout.close()
543                 #update the url to a local url
544                         r_files[fname]['url'] = fnout
545                 return r_files
546                 
547         def pluck_frs_releases(self, pk_releases):
548                 '''
549                 Get the list of releases contained in the specified package and extract the content of these releases with the pluck_frs_release method 
550                 @param pk_releases: End of the url linking to a package
551                 '''
552                 result = {}
553                 init_page = self.fetch('frs/admin/' + pk_releases, 'Releases')
554                 soup = BeautifulSoup(init_page)
555                 trs = soup.find('table')
556                 if trs != None:
557                         trs = trs.findAllNext('tr')[1:]
558                         for tr in trs:
559                                 rel_edit = tr.find('a')['href']
560                                 rel_name, rel_data = self.pluck_frs_release(rel_edit)
561                                 result[rel_name] = rel_data
562                         
563                 return result
564         
565         
566         def pluck_frs_release(self, rel_edit):
567                 '''
568                 Get the content of a release, use pluck_files_for_release to get the linked files
569                 @param rel_edit: End of the url linking to the edit page of a release
570                 '''
571                 result = {}
572                 init_page = self.fetch('frs/admin/' + rel_edit, 'Release edit')
573                 soup = BeautifulSoup(init_page)
574                 r_date = soup.find('input',attrs={'name':'release_date'})['value']
575                 r_name = soup.find('input',attrs={'name':'release_name'})['value']
576                 r_status = soup.find('select',attrs={'name':'status_id'}).findNext('option',attrs={'selected':'selected'}).contents[0]
577                 try:
578                         r_notes = soup.find('textarea',attrs={'name':'release_notes'}).contents[0]
579                 except:
580                         r_notes=""
581                 try:
582                         r_change = soup.find('textarea',attrs={'name':'release_changes'}).contents[0]
583                 except:
584                         r_change=""
585                 
586                 r_files = {}
587                 
588                 table = soup.findAll('table')[2]
589                 trs = soup.findAll('table')[2].findAll('tr')[1:]
590                 i = 0
591                 while i<len(trs):
592                         tr = trs[i]
593                         fname = tr.find('td').contents[0]
594                         ftype = tr.find('select', {'name':'type_id'}).find('option', {'selected':'selected'}).contents[0]
595                         fprocessor = tr.find('select', {'name':'processor_id'}).find('option', {'selected':'selected'}).contents[0]
596                         fdate = trs[i+1].find('input', {'name':'release_time'})['value']
597                         r_files[fname]={'type':ftype,'processor':fprocessor,'date':fdate}
598                         i+=3
599                 
600                 r_id = re.search('release_id=([0-9]*)', rel_edit).group(1)
601                 if r_status == 'Hidden':
602                         self.switch_rel_status(rel_edit, r_name, 1, r_date, r_notes, r_change)
603                 r_files = self.pluck_files_for_release(r_id, r_files)
604                 if r_status == 'Hidden':
605                         self.switch_rel_status(rel_edit, r_name, 3, r_date, r_notes, r_change)
606                 result = {'date':r_date,'status':r_status,'release_notes':r_notes,'change_log':r_change,'files':r_files}
607                 return r_name, result
608                 
609         ###### TASKS PARSING : COCLICO NEW FEATURE
610         
611         class Task:
612                 ## CLASSE DES TASKS
613                 def __init__(self, label, parent, projectbase):
614                         self.parent = parent
615                         self.optional = False
616                         self.chunksize = 50
617                         self.zerostring = None
618                         self.label = label
619                         self.projectbase = projectbase
620                         self.name_mappings = {
621                         "bug_group_id": "group",
622                         "category_id": "category",
623                         "resolution_id": "resolution",
624                         "status_id": "status"
625                 }
626                         # l'auteur du bug n'est pas éditable et ne peut donc être récupérée par le crawler dans le formulaire, cette regexp ne sert qu'à lui
627                         self.submitter_re = '''<td><strong>Submitted by:</strong><br />\n[^(]*\(([^)]*)'''
628                         # la date n'est pas éditable et ne peut donc être récupérée par le crawler dans le formulaire
629                         self.date_re = '''()'''
630                         # vérifier les champs de formulaire à ignorer
631                         self.ignore = ("canned_response",
632                         "new_artifact_type_id",
633                         "words", "type_of_search", "start_month", "end_month")
634                         # identifie les ids d'artefacts (bug, patches...) : aid
635
636                         self.artifactid_re = r'/pm/task.php\?func=detailtask&amp;project_task_id=([0-9]+)&amp;group_id=[0-9]*&amp;group_project_id=[0-9]*'
637                         
638
639                 def access_denied(self, page, issue_id=None):
640                         "Vérifie si l'utilisateur n'a pas un accès en édition au tracker interrogé (il doit être au minimum tracker admin)"
641                         if "No Matching Tasks found" in page:
642                                 return 0
643                         else:
644                                 return issue_id is None and not "Mass Update" in page
645
646                 def has_next_page(self, page):
647                         """
648                         Vérifie si la page contient un compteur indiquant qu'il existe une autre page dans la liste d'artefacts affichés.
649                         A vérifier avec 50 bugs ou avec les données réelles.
650                         """
651                         return "Next &raquo;" in page
652
653                 def chunkfetcher(self, offset):
654                         "Get a bugtracker index page - all bug IDs, open and closed.."
655                         return self.projectbase + "&offset=%d&limit=100" % offset
656
657                 #fonctionne avec func=browse&func=detail&aid=X (rend le detail de X), mais très laid...
658                 def detailfetcher(self, issueid):
659                         "Generate a bug detail URL for the specified bug ID."
660                         return self.projectbase + '&func=detailtask&project_task_id=' + str(issueid)
661
662                 #TODO
663                 def narrow(self, text):
664                         "Get the section of text containing editable elements."
665                         #Look for <div id="maindiv">
666                         return text
667         
668                 def parse_followups(self, contents):
669                         '''
670                         Parse followups out of a displayed page in a task tracker.
671                         @param contents: Web page of the artifact
672                         '''
673                         comments = []
674                         try:
675                                 trs = contents.find('h3',text='Followups').findNext('table').findAll('tr')[1:]
676                                 for tr in trs:  
677                                         comment = {"class":"COMMENT"} #Useful?
678                                         tds = tr.findAll('td')
679                                         comment['comment'] = dehtmlize(''.join(map(str,tds[0].contents)))
680                                         comment['date'] = self.parent.canonicalize_date(tds[1].contents[0])
681                                         comment['submitter'] = tds[2].contents[0]
682                                         comments.append(comment)
683                                 comments.reverse()
684                         except AttributeError:
685                                 #No registered followup
686                                 pass
687                         return comments
688                 
689                 def parse_history(self, contents):
690                         '''
691                         Get the history of changes of the current task
692                         @param contents: Web page of the artifact
693                         '''
694                         history=[]
695                         try:
696                                 trs = contents.find('h3',text='Task Change History').findNext('table').findAll('tr')[1:]
697                                 for tr in trs:
698                                         h = {"class":"HISTORY"}
699                                         tds = tr.findAll('td')
700                                         h['field'] = tds[0].contents[0]
701                                         h['old'] = tds[1].contents[0]
702                                         h['date'] = self.parent.canonicalize_date(tds[2].contents[0])
703                                         h['by'] = tds[3].contents[0]
704                                         history.append(h)
705                                 history.reverse()
706                         except AttributeError:
707                                 #No registered history
708                                 pass
709                         return history
710                 
711                 def parse_linkedTasks(self, contents):
712                         '''
713                         Get the linked, required tasks of the current task
714                         @param contents: Web page of the artifact
715                         '''
716                         linked = []
717                         try:
718                                 trs = contents.find('h3',text='Tasks That Depend on This Task').findNext('table').findAll('tr')[1:]
719                                 for tr in trs:
720                                         l = {"class":"LINKED_ARTIFACT"}
721                                         tds = tr.findAll('td')
722                                         l['task_id'] = tds[0].contents[0].next
723                                         l['task_summary'] = tds[1].contents[0]
724                                         linked.append(l)
725                                 linked.reverse()
726                         except AttributeError:
727                                 #No registered linked task
728                                 pass
729                         return linked
730                         
731                 def custom(self, contents, artifact, vocabularies):
732                         '''
733                         Pluck the specificic properties of this plucker.
734                         @param contents: Web page of the artifact
735                         @param artifact: Artifact array as plucked by the basic extractor
736                         @param vocabularies: Vocabularies array as plucked by the basic extractor
737                         '''
738                         soupedContents = BeautifulSoup(contents)
739 #                       Get comments
740                         artifact['comments'] = self.parse_followups(soupedContents)
741 #                       Get history
742                         artifact['history'] = self.parse_history(soupedContents)
743 #                       Get linked tasks
744                         artifact['linked_tasks'] = self.parse_linkedTasks(soupedContents)
745 #                       Get the first comment of the task which cannot be modified and is the description of the task
746                         artifact['description'] = soupedContents.find('td', attrs={'colspan':'2'}).find('strong').nextSibling.nextSibling.strip() #Use colspan because the original comment + add comment box are always in a colspan 2 td with no id
747 #                       Gather the number of the projected end moth of the task
748                         end_month = soupedContents.find('select',attrs={'name':'end_month'}).find('option', attrs={'selected':'selected'})['value'] #Maybe use a corresponding tab, Plucker give the content value, the name of the month (January, Feb..., etc) instead of the number
749 #                       Format the end date as a normal date as used for comment or followups date
750                         end_date = artifact['end_year']+'-'+end_month+'-'+artifact['end_day']+' '+artifact['end_hour']+':'+artifact['end_minute']
751 #                       Delete unused categories in the artifact and vocabularies. Could have been filtered from the start but would require more search in the soupedContents, unsure which way is the fastest
752                         del artifact['end_year'],artifact['end_day'],artifact['end_hour'],artifact['end_minute']
753                         del vocabularies['end_year'],vocabularies['end_day'],vocabularies['end_hour'],vocabularies['end_minute']
754 #                       Register the date in the artifact as end_date
755                         artifact['end_date'] = self.parent.canonicalize_date(end_date)
756 #                       Same for start date
757                         start_month = soupedContents.find('select',attrs={'name':'start_month'}).find('option', attrs={'selected':'selected'})['value']
758                         start_date = artifact['start_year']+'-'+start_month+'-'+artifact['start_day']+' '+artifact['start_hour']+':'+artifact['start_minute']
759                         del artifact['start_year'],artifact['start_day'],artifact['start_hour'],artifact['start_minute']
760                         del vocabularies['start_year'],vocabularies['start_day'],vocabularies['start_hour'],vocabularies['start_minute']
761                         artifact['start_date'] = self.parent.canonicalize_date(start_date)
762                         artifact['hours'] = soupedContents.find('input', attrs={'name':'hours'})['value']
763                         try:
764                                 pct = soupedContents.find('select', attrs={'name':'percent_complete'}).find('option', attrs={'selected':'selected'})['value']
765                         except TypeError:
766                                 pct = 0
767                         artifact['percent_complete'] = pct
768
769                         return True
770                 
771         class CustomizableTaskTracker(Task):
772                 '''
773                 Basic TaskTrackerObject
774                 '''
775                 def __init__(self, parent, nameTracker, typeTracker, projectbase):
776                         FusionForge.Task.__init__(self, nameTracker, parent, projectbase)
777                         self.type = typeTracker
778                 
779         def getTasksTrackers(self):     
780                 '''
781                 Parse the tasks initial page and return a list of dictionaries containing (tasks tracker name, tasks tracker type (default custom)).
782                 If type corresponds to one of the fixated types, register it
783                 '''
784                 tasksTrackers = []
785                 basepage = BeautifulSoup(self.basepage)
786                 tT = basepage.findAll('a', {'href':re.compile('task.php')})
787                 for t in tT:
788                         tPage = re.search('[^/]*//[^/]*/([^"]*)',t['href']).group(1)
789                         tLabel = t.contents[0]
790                         tasksTrackers.append({'label':tLabel, 'type':'custom', 'projectbase':tPage})
791                 return tasksTrackers
792         
793         def pluck_tasksTrackers(self, timeless=False):
794                 '''
795                 Initializes tasks plucking. This is the only method which should be called for tasks plucking. 
796                 Return the corresponding data for each tasks tracker of the project.
797                 @param timeless:Needed to register interval between beginning and end of scrapping
798                 '''
799                 self.trackers = [] #Reset trackers to empty
800                 for tasksTracker in self.getTasksTrackers():
801                         self.trackers.append(FusionForge.CustomizableTaskTracker(self, tasksTracker['label'], tasksTracker['type'], tasksTracker['projectbase']))
802                 return self.pluck_trackers(timeless, True)
803
804         ###### NEWS PARSING : COCLICO NEW FEATURE
805         
806         def pluck_news(self):
807                 '''
808                 Initializes the extraction of the news of a project. This is the only method which should be called from outside. 
809                 '''
810                 init_page = self.fetch('news/?group_id='+self.project_id, 'plucking main news page')    
811                 soup = BeautifulSoup(init_page)
812                 result = self.newsListParser(soup)
813                 return result
814         
815         
816         def newsListParser(self, soup):
817                 '''
818                 Parse the list of news of the news list page
819                 @param soup: the souped html of the news' list page
820                 '''
821                 newsList = []
822                 links = soup.findAll('a',{'href':re.compile('forum/forum.php')})
823                 i = 0
824                 while i<len(links):
825                         newsList.append(links[i]['href'])
826                         if i<20: #nombre de news affichées en full
827                                 i+=2
828                         else:
829                                 i+=1
830                 result = []
831                 for news in newsList:
832                         result.append(self.newsParser(BeautifulSoup(self.fetch(news, 'plucking news'))))
833                 return result
834
835         def newsParser(self, newsSoup):
836                 '''
837                 Parse a single news page
838                 @param news: a news page soup
839                 '''
840                 newsTable = newsSoup.find('table')
841                 poster_name = newsTable.find(text=re.compile('Posted')).next.strip()
842                 news_date = newsTable.find(text=re.compile('Date')).next.strip()
843                 news_summary = newsTable.find(text=re.compile('Summary')).next.contents[0].encode('utf-8')
844                 news_content=''
845                 for s in newsTable.find('p').contents:
846                         if s.string != None:
847                                 news_content+=s.string
848                 news_content = news_content.strip().replace('\r','\n')
849                 news_content = news_content.encode('utf-8')
850 #               news_content = dehtmlize(''.join(blocktext(str(s)) for s in newsTable.find('p').contents)) #Couldn't make a correct code to handle both the <br /> and the \n without either too many \n or none at all... replaced by the length above code
851                 news_forum = self.forumParser(newsSoup, 5, 4, 3)
852                 result = {'poster_name':poster_name, 'date':news_date,'summary':news_summary,'news_content':news_content,'forum':news_forum}
853                 return result
854                 
855                 
856         def mailingListsListing(self, soup):
857                 mailinglists = []
858                 for (archive_html, description_html, listinfo_html) in self.table_iter(str(soup), '<table', 3, 'Mailing Lists Table', has_header=True, keep_html=True):
859                         a = listinfo_html.find('a')
860                         fHref = a['href']
861                         mailinglists.append(fHref)
862                 return mailinglists
863                 
864         def taskTrackersListing(self, soup):
865                 task_trackers = []
866                 trs = soup.find('table').findAll('tr')[1:]
867                 for tr in trs:
868                         tds = tr.findAll('td')
869                         fHref = tds[0].find('a')['href']
870                         task_trackers.append(fHref)
871                 return task_trackers
872
873         def scmListing(self, soup):
874                 scm = scm_type = None
875                 tts = soup.findAll('tt')
876                 for tt in tts:
877                         m = re.search('([^ ]+) checkout .* (http.*)', tt.contents[0])
878                         if m:
879                                 scm_type = m.group(1)
880                                 scm = m.group(2)
881                                 break
882                 return scm_type, scm
883
884         ###### FORUM PARSING : COCLICO NEW FEATURE
885         
886         def pluck_forums(self):
887                 '''
888                 Initializes forums parsing. This is the method which should be called from the outside
889                 '''
890                 if self.verbosity >= 1:
891                         self.notify('plucking forums from forum/?group_id='+self.project_id)
892                 init_page = self.fetch('forum/?group_id='+self.project_id, 'plucking main forum page')
893                 soup = BeautifulSoup(init_page)
894                 result = self.forumsParser(soup)
895                 return result
896         
897         class Message():
898                 '''
899                 When parsing a forum, the method creates a Message object for each message found. This object provides methods to retrieve the informations such as content, linked file ... of the initialized message
900                 '''
901                 def __init__(self, parent, href, index):
902                         '''
903                         Intializes the Message object and retrieves information using the retrieveInfo method
904                         @param parent: Fusionforge class, needed to get fetch method access
905                         @param href: Message url
906                         @param index: index of the message table in the list of tables of the page
907                         '''
908                         self.parent = parent
909                         self.href = href
910                         self.index = index
911                         self.messagesObjects = []
912                         self.infos = self.retrieveInfo()
913                         
914                 def retrieveInfo(self):
915                         '''
916                         Retrieve the info of the message : submitter, date, attached file, subject, content
917                         '''
918                         message = self.parent.fetch(self.href, 'Retrieving message info for msg_id '+self.href)
919                         message = BeautifulSoup(message)
920                         table = message.findAll('table')[self.index]
921                         submitter_login = table.find('a', {'href':re.compile('/users/')}).contents[0]
922                         date = table.find(text=re.compile('DATE:'))[6:]
923                         if table.find(text=re.compile('No attachment')) != None:
924                                 #No attachment
925                                 attachment = {}
926                         else:
927                                 #attachment found
928                                 fhref = table.find('a',{'href':re.compile('javascript:manageattachments')})
929                                 fname = table.find('a',{'href':re.compile('javascript:manageattachments')}).contents[1].encode('utf-8')
930                                 furl = re.search(":manageattachments\('([^']*)", fhref['href']).group(1)
931                                 rep_dest = self.parent.project_name + '/forum'
932                                 if not os.path.exists(self.parent.project_name):
933                                         os.mkdir(self.parent.project_name)
934                                 if not os.path.exists(rep_dest):
935                                         os.mkdir(rep_dest)
936                                 dl = self.parent.fetch(furl, 'forum attachment file fetching')
937                                 fnout = rep_dest + '/' + fname
938                                 fout = open(fnout, "wb")
939                                 fout.write(dl)
940                                 fout.close()
941                                 attachment = {'name':fname, 'url':fnout}
942                         subject = (table.find(text=re.compile('SUBJECT:'))[9:]).encode('utf-8')
943                         content = re.search('<p>&nbsp;</p>(.*)</td></tr></table>',str(table),re.DOTALL).group(1)
944                         content = content.decode('iso-8859-1')
945                         content = content.encode('utf-8')
946                         
947                         return {'submitter':submitter_login, 'date':date, 'attachment':attachment, 'subject':subject, 'content':content}
948         
949                 def addChild(self, message):
950                         '''
951                         Add a message to the current message children
952                         @param message: an object Message
953                         '''
954                         self.messagesObjects.append(message)
955                         
956                 def toDict(self):
957                         '''
958                         Used to extract a dictionary recursively of the content of this message and of each of its children. Called once on each level 0 message.
959                         '''
960                         messagesList = []
961                         for message in self.messagesObjects:
962                                 messagesList.append(message.toDict())
963                         self.infos['children']=messagesList
964                         return self.infos
965         
966         def forumParser(self, soup, nextTableIndex = 2, dataTableIndex = 1, messageTableIndex = 0):
967                 '''
968                 Parse the content of the forum
969                 @param soup: soupedContent of the forum page to parse
970                 @param nextTableIndex: Index of the table containing the 'Previous Page'/'Next Page' in the whole page
971                 @param dataTableIndex: Index of the table containing the messages in the whole page
972                 @param messageTableIndex: Index of the table containing the message content when viewing a single message rather than the whole forum
973                 '''
974                 #Will store ids of each message parsed, useful because Threaded View may show the same thread and messages on different pages and parser may store them multiple times
975                 liste_ids = []
976                 #Boolean used to allow multiple forum pages parsing
977                 nextPage = True
978                 #Stores each message at depth 0, meaning each root of a new thread
979                 listeMessagesObjects = []
980                 #Stores each message parsed in a {msg id:msg object} dictionary, useful to add children to message parsed on a different page. It sometimes happen that we reset the root of a thread then discover another children of this thread on a subsequent page
981                 dictAllMessagesObjects = {}
982                 #Stores the current thread from the root
983                 liste = []
984                 #Stores the last message seen in a {depth:msg object} dictionary, even if the message has already been seen
985                 lastMsgAtDepth = {}
986                 while nextPage:
987                         
988                         tables = soup.findAll('table')
989                         nextTable = tables[nextTableIndex]
990                         dataTable = tables[dataTableIndex]
991                         trs = dataTable.findAll('tr')[1:]
992
993                         for tr in trs:
994                                 href = tr.find('a')['href']
995                                 #check each time?
996                                 #print href
997                                 msg_id = re.search('msg_id=([0-9]*)', href).group(1)
998                                 depth = (len(str(tr.contents[0]).split(';'))-1)/3
999                                 #Si l'id du message n'a pas déjà été parcourue:
1000                                 if not msg_id in liste_ids:
1001
1002                                         liste_ids.append(msg_id)
1003                                         
1004                                         
1005                                         if depth == 0:
1006                                                 try:
1007                                                         listeMessagesObjects.append(liste[0])
1008                                                 except:
1009                                                         pass
1010                                                 msg = self.Message(self, href, messageTableIndex)
1011                                                 dictAllMessagesObjects[msg_id] = msg
1012                                                 try:
1013                                                         liste[0] = msg
1014                                                 except:
1015                                                         liste.append(msg) 
1016                                         else:
1017                                                 msg = self.Message(self, href, messageTableIndex)
1018                                                 dictAllMessagesObjects[msg_id] = msg
1019                                                 try:
1020                                                         liste[depth-1].addChild(msg)
1021                                                         try:
1022                                                                 liste[depth] = msg
1023                                                         except:
1024                                                                 liste.append(msg)
1025                                                 except:
1026                                                         lastMsgAtDepth[depth-1].addChild(msg)
1027                                                         lastMsgAtDepth[depth] = msg
1028                                 #Si l'id du message a déjà été parsée, reset de la liste
1029                                 else:
1030                                         lastMsgAtDepth[depth] = dictAllMessagesObjects[msg_id]
1031                                         liste = []
1032                                         
1033                         if len(liste)>0:
1034                                 listeMessagesObjects.append(liste[0])
1035                         
1036                         hasNextPage = nextTable.find(text=' Next Messages')
1037                         if hasNextPage == None:
1038                                 nextPage = False
1039                         else:
1040                                 nextPageHtml = self.fetch(hasNextPage.findPrevious('a')['href'], 'Fetching next forum page')
1041                                 soup = BeautifulSoup(nextPageHtml)
1042                 
1043                 listeMessagesDicts = []
1044                 for message in listeMessagesObjects:
1045                         listeMessagesDicts.append(message.toDict())     
1046                 return listeMessagesDicts
1047
1048         def forumAdminParser(self, soup):
1049                 '''
1050                 Parse the content of the admin page of the forum
1051                 @param soup:
1052                 '''
1053                 anon_posts = soup.find('input', {'name':'allow_anonymous', 'checked':'checked'}).next.strip()
1054                 is_public = soup.find('input', {'name':'is_public', 'checked':'checked'}).next.strip()
1055                 moderation = soup.find('select',{'name':'moderation_level'}).find('option',{'selected':'selected'}).contents[0]
1056                 email_posts_to = soup.find('input',{'name':'send_all_posts_to'})['value']
1057                 dictParams = {'allow_anonymous_posts':anon_posts,'is_public':is_public, 'moderation_level':moderation, 'email_posts_to':email_posts_to}
1058                 return dictParams 
1059         
1060         def forumMonitorParser(self, soup):
1061                 '''
1062                 Parse the content of the monitoring users table
1063                 @param soup:
1064                 '''
1065                 listeUsers = []
1066                 trs = soup.find('table').findAll('tr')[1:]
1067                 for tr in trs:
1068                         listeUsers.append(tr.find('td').contents[0])
1069                 return listeUsers
1070         
1071         def forumSwitchViewMode(self, fId):
1072                 '''
1073                 Switch view mode to Threaded (only mode parsed)
1074                 @param fId: forum Id
1075                 '''
1076                 self.fetch('forum/forum.php?set=custom&forum_id='+str(fId)+'&style=threaded&max_rows=25&submit=Change+View', 'Updating '+str(fId) + ' view mode to Threaded')
1077                 return True
1078
1079         def forumsListing(self, soup):
1080                 forums = []
1081                 trs = soup.find('table').findAll('tr')[1:]
1082                 for tr in trs:
1083                         tds = tr.findAll('td')
1084                         fHref = tds[0].find('a')['href']
1085                         fId = re.search('forum_id=([0-9]*)', fHref).group(1)
1086                         fUrl = 'forum/'+fHref
1087                         fName = tds[0].find('a').contents[1][6:].encode('utf-8')
1088                         fDesc = tds[1].contents[0].encode('utf-8')
1089                         self.forumSwitchViewMode(fId)
1090                         fAdminUrl = 'forum/admin/index.php?group_id='+self.project_id+'&change_status=1&group_forum_id='+fId
1091                         fMonitorUrl = '/forum/admin/monitor.php?group_id='+self.project_id+'&group_forum_id='+fId
1092                         forums.append({'name':fName, 'description':fDesc, 'URL':fUrl, 'adminUrl':fAdminUrl, 'monitoring_usersUrl':fMonitorUrl})
1093                 return forums
1094                 
1095         def forumsParser(self, soup):
1096                 '''
1097                 Fetch a list of forums from the souped contents of the forums page, initializes the extraction ot the content of each forum and the admin parameters
1098                 @param soup: Souped contents of the forums page
1099                 '''
1100                 forums = []
1101                 frs = self.forumsListing(soup)
1102                 for f in frs:
1103                         fName = f['name']
1104                         fDesc = f['description']
1105                         fUrl = f['URL']
1106                         fAdminUrl = f['adminUrl']
1107                         fMonitorUrl = f['monitoring_usersUrl']
1108                         fId = re.search('forum_id=([0-9]*)', fUrl).group(1)
1109                         self.forumSwitchViewMode(fId)
1110                         fAdminContent = self.forumAdminParser(BeautifulSoup(self.fetch(fAdminUrl, 'forum admin content download. forum name:'+fName)))
1111                         fMonitorContent = self.forumMonitorParser(BeautifulSoup(self.fetch(fMonitorUrl, 'forum monitoring users content download. forum name:'+fName)))
1112                         fContent = self.forumParser(BeautifulSoup(self.fetch(fUrl, 'forum content downloading. forum name:'+fName))) 
1113                         forums.append({'name':fName, 'description':fDesc, 'content':fContent, 'admin':fAdminContent, 'monitoring_users':fMonitorContent})
1114                 return forums
1115                 
1116         
1117         ###### END FORUM PARSING
1118
1119         class ProjectDescription:
1120                 "Project description"
1121                 def __init__(self):
1122                         pass
1123 #               def extract(self, contents):
1124                         
1125                         
1126 ## FusionForge.__init__
1127
1128         def __init__(self, host, project_name, params = False):
1129                 """                     """
1130                 GenericForge.__init__(self, host, project_name, params)
1131                 
1132                 self.basepage = self.fetch(self.project_page(project_name),
1133                                 "Main page")
1134                 m = re.search(r'\?group_id=([0-9]*)', self.basepage)
1135
1136                 # #self.verbosity = 2
1137
1138                 # # some trackers may be private, so may not be displayed in project's homepage
1139                 # project_homepage = self.fetch(self.project_page(project_name),
1140                 #                 "Main page")
1141                 # #print project_homepage
1142                 # m = re.search(r'/tracker/\?group_id=([0-9]*)', project_homepage)
1143
1144                 if m:
1145                         #print m.group(0)
1146                         self.project_id = m.group(1)
1147                         #self.basepage = self.fetch('tracker/', "Trackers page", params={'group_id': self.project_id})
1148                         #print "basepage :",self.basepage
1149                 else:
1150                         raise ForgePluckerException("No matching id found for project %s" % project_name)
1151
1152
1153         # Overloaded to customize FF paths 'host/projects/unixname'
1154         def project_page(self, project):
1155                 "Computes project address"
1156                 return "projects/%s/" % (project,)
1157
1158         # def tracker_list_page(self):
1159         #       "Computes project address"
1160         #       return "tracker/?group_id=%s/" % group_id
1161
1162 ## DATE
1163
1164         @staticmethod
1165         def canonicalize_date(localdate):
1166                 "Canonicalize dates to ISO form. Assumes dates are in local time."
1167                 t = time.strptime(localdate, "%Y-%m-%d %H:%M")
1168                 secs = time.mktime(t)   # Local time to seconds since epoch
1169                 t = time.gmtime(secs)   # Seconds since epoch to UTC time in structure
1170                 return time.strftime("%Y-%m-%dT%H:%M:%SZ", t)
1171
1172 ## LOGIN
1173
1174         def login(self, username, password):
1175                 # POST data in the login form
1176                 GenericForge.login(self, 
1177                                    {
1178                                 'form_loginname':username,
1179                                 'form_pw':password,
1180                                 'login':'login'},
1181                                    'href="'+ self.real_url('account/logout.php') +'">')
1182
1183
1184         def narrow(self, text):
1185                 "Get the section of text containing editable elements."
1186                 soupedContents = BeautifulSoup(text)
1187                 text = soupedContents.find('div', id='gforge-content')
1188                 if not text:
1189                         text = soupedContents.find('div', id='maindiv')
1190                 return str(text)
1191
1192         def pluck_project_data(self):
1193                 project_page = self.project_page(self.project_name)
1194                 page = self.fetch(project_page, "Project summary")
1195                 mainsoup = BeautifulSoup(self.narrow(page))
1196                 
1197                 description = None
1198                 shortdesc = None
1199                 fieldset = mainsoup.find('fieldset')
1200                 # 4.8
1201                 if fieldset:
1202                         description = fieldset.find('table').find('tr').find('td').find('p').contents
1203                         description = dehtmlize(''.join(map(str,description)))
1204                 else: #5.0
1205                         shortdesc = mainsoup.find('h2').contents[0]
1206                         description = mainsoup.find('p').contents[0]
1207
1208                 registered = None
1209                 for p in mainsoup.findAll('p'):
1210                         m = re.search('Registered:&nbsp;([-0-9: ]+)', str(p.contents[0]))
1211                         if m:
1212                                 registered = self.canonicalize_date(m.group(1))
1213                                 break
1214
1215                 homepage = None
1216                 trackers = None
1217                 public_forums = None
1218                 docman = None
1219                 mailing_lists = None
1220                 task_trackers = None
1221                 scm = None
1222                 news = None
1223                 frs = None
1224
1225                 public_areas = None
1226                 for t in mainsoup.findAll('table'):
1227                         tr = t.find('tr', attrs={'class': 'tableheading'})
1228                         if tr:
1229                                 td = tr.find('td').findNext('td').find('span').contents[0]
1230                                 if td == 'Public Areas' :
1231                                         public_areas = t
1232                                         break
1233                 if public_areas:
1234                         #print 'public_areas:', public_areas
1235                         t = public_areas.find('tr').findNext('tr').findNext('tr').find('td').find('table', attrs={'class': 'tablecontent'})
1236                         a = t.find('a')
1237                         while a:
1238                                 for l in a.contents:
1239                                         if l == '&nbsp;Project Home Page':
1240                                                 homepage = a['href']
1241                                         if l == '&nbsp;Tracker':
1242                                                 trackers = self.get_trackers()
1243                                         if l == '&nbsp;Public Forums':
1244                                                 init_page = self.fetch('forum/?group_id='+self.project_id, 'plucking main forum page')
1245                                                 soup = BeautifulSoup(self.narrow(init_page))
1246                                                 public_forums = self.forumsListing(soup)
1247                                         if l == '&nbsp;DocManager: Project Documentation':
1248                                                 docman = a['href']
1249                                         if l == '&nbsp;Mailing Lists':
1250                                                 init_page = self.fetch('mail/?group_id='+self.project_id, 'plucking main mailing lists page')
1251                                                 soup = BeautifulSoup(self.narrow(init_page))
1252                                                 mailing_lists = self.mailingListsListing(soup)
1253                                         if l == '&nbsp;Task Manager':
1254                                                 init_page = self.fetch('pm/?group_id='+self.project_id, 'plucking main tasks page')
1255                                                 soup = BeautifulSoup(self.narrow(init_page))
1256                                                 task_trackers = self.taskTrackersListing(soup)
1257                                         if l == '&nbsp;SCM Repository':
1258                                                 init_page = self.fetch('scm/?group_id='+self.project_id, 'plucking scm page')
1259                                                 soup = BeautifulSoup(self.narrow(init_page))
1260                                                 scm = self.scmListing(soup)
1261                                                 
1262                                 a = a.findNext('a')
1263
1264                         a = public_areas.find('a')
1265                         while a:
1266                                 for l in a.contents:
1267                                         if l == '[News archive]':
1268                                                 news = a['href']
1269                                                 break
1270                                 a = a.findNext('a')
1271
1272                 for a in mainsoup.findAll('a'):
1273                         for l in a.contents:
1274                                 if l == '[View All Project Files]':
1275                                         frs = a['href']
1276                                         break
1277                         if frs:
1278                                 break
1279
1280                 project_url = self.real_url(project_page)
1281                 data = {"class":"PROJECT",
1282                         "forgetype":self.__class__.__name__,
1283                         "host" : self.host,
1284                         "project" : self.project_name,
1285                         "description" : description,
1286                         "registered" : registered,
1287                         "homepage": homepage,
1288                         "URL": project_url,
1289                         "format_version": 1 }
1290
1291                 forge = self.real_url('')
1292
1293                 tools = {}
1294
1295                 if shortdesc:
1296                         data['shortdesc'] = shortdesc
1297
1298                 if trackers:
1299                         data['trackers_list'] = []
1300                         for t in trackers:
1301                                 url = self.real_url(t.getUrl())
1302                                 data['trackers_list'].append(url)
1303                                 provided_by = forge+'#tracker'
1304                                 tools[url] = { 'provided_by': provided_by }
1305                                 if not provided_by in tools:
1306                                         tools[provided_by] = { 'type': 'TrackersTool',
1307                                                                'name': 'http://fusionforge.org/tool/trackers'}
1308
1309                 if public_forums:
1310                         data['public_forums'] = []
1311                         for f in public_forums:
1312                                 url = self.real_url(f['URL'])
1313                                 data['public_forums'].append(url)
1314                                 provided_by = forge+'#forum'
1315                                 tools[url] = { 'provided_by': provided_by }
1316                                 if not provided_by in tools:
1317                                         tools[provided_by] = { 'type': 'ForumsTool',
1318                                                                'name': 'http://fusionforge.org/tool/forums'}
1319                                 
1320                 if docman:
1321                         data['docman'] = docman
1322                         provided_by = forge+'#docman'
1323                         tools[docman] = { 'provided_by': provided_by }
1324                         if not provided_by in tools:
1325                                 tools[provided_by] = { 'type': 'DocumentsTool',
1326                                                                'name': 'http://fusionforge.org/tool/docman'}
1327
1328                 if mailing_lists:
1329                         data['mailing_lists'] = mailing_lists
1330                         for m in mailing_lists:
1331                                 provided_by = forge+'#mailman'
1332                                 tools[m] = { 'provided_by': provided_by }
1333                                 if not provided_by in tools:
1334                                         tools[provided_by] = { 'type': 'MailingListTool',
1335                                                                'name': 'mailman'}
1336
1337                 if task_trackers:
1338                         data['task_trackers'] = task_trackers
1339                         for t in task_trackers:
1340                                 provided_by = forge+'#taskstracker'
1341                                 tools[t] = { 'provided_by': provided_by }
1342                                 if not provided_by in tools:
1343                                         tools[provided_by] = { 'type': 'TaskTool',
1344                                                                'name': 'http://fusionforge.org/tool/tasks/'}
1345
1346                 if scm:
1347                         scm_type, scm = scm
1348                         data['scm_type'] = scm_type
1349                         data['scm'] = scm
1350                         provided_by = forge+'#'+scm_type
1351                         tools[scm] = { 'provided_by': provided_by }
1352                         if not provided_by in tools:
1353                                 if scm_type == 'svn':
1354                                         tools[provided_by] = { 'type': 'SvnScmTool',
1355                                                                'name': 'svn'}
1356                                 else:
1357                                         tools[provided_by] = { 'type': 'ScmTool'}
1358
1359
1360                 if news:
1361                         data['news'] = news
1362                         provided_by = forge+'#news'
1363                         tools[news] = { 'provided_by': provided_by }
1364                         if not provided_by in tools:
1365                                 tools[provided_by] = { 'type': 'NewsTool',
1366                                                        'name': 'http://fusionforge.org/tool/news/'}
1367
1368                 if frs:
1369                         data['frs'] = frs
1370                         provided_by = forge+'#frs'
1371                         tools[frs] = { 'provided_by': provided_by }
1372                         if not provided_by in tools:
1373                                 tools[provided_by] = { 'type': 'FilesReleasesTool',
1374                                                        'name': 'http://fusionforge.org/tool/frs/'}
1375
1376                 data['tools'] = tools
1377
1378                 return data