python处理文本

想处理一个文本,输出特定格式化的文本 初学python不知道错误在哪里

输入文本的形式是这样的
<DOC>
<DOCNO> WS880212-0001 </DOCNO>
<FILEID>AP-NR-02-12-88 2344EST</FILEID>
<FIRST>u i AM-Vietnam-Amnesty 02-12 0398</FIRST>
<SECOND>AM-Vietnam-Amnesty,0411</SECOND>
<HEAD>Reports Former Saigon Officials Released from Re-education Camp</HEAD>
<DATELINE>BANGKOK, Thailand (AP) </DATELINE>
<TEXT>
  More than..........
</TEXT>
</DOC>
希望提取其中的docno 和text 希望输出的文本格式是这样的
<DOC>
<DOCNO> 51 </DOCNO>//就是原文本中的DOCNO
Airbus Subsidies//文本中text的内容 
</DOC>

程序有一个参数可以指定输出的内容为text 或者title或者其他标签的内容
以下为代码

import sys
import re
import os
from sgmllib import SGMLParser

class QueryParser(SGMLParser):
  def clr(self): 
  self.inDOCNO, self.inDesc, self.inNarr, self.inTitle, self.inText = 0,0,0,0,0

  def reset(self):
  SGMLParser.reset(self)
  # self.feq = None
  self.clr()

  def unknown_starttag(self, tag, attrs):
  self.clr()
   
  def start_top(self, attrs):
  fout.write("<DOC>\n")
   
  def end_top(self):
  # try:
  # self.feq.close()
  # except:
  # pass
  fout.write("</DOC>\n")

  def start_DOCNO(self, attrs): self.clr(); self.inDOCNO = 1
  def start_title(self, attrs): self.clr(); self.inTitle = 1
  def start_desc(self, attrs): self.clr(); self.inDesc = 1
  def start_narr(self, attrs): self.clr(); self.inNarr = 1
  def start_text(self, attrs): self.clr(); self.inText = 1

   
  def handle_data(self, text):
  text = text.split()[1:]
  if self.inDOCNO:
  fout.write("<DOCNO> %d </DOCNO>\n"% int(text[0]))
  # self.feq = open(os.path.join('queries', 'query%d.txt' % int(text[0])), 'w')
  if (self.inTitle and flgTitle) or (self.inDesc and flgDesc) or (self.inNarr and flgNarr) or (self.inText and flgText):
  fout.write(" ".join(text))
  fout.write('\n')
  # self.feq.write(" ".join(text))
  # self.feq.write('\n')
   
def process(filename):
  print 'Process %s' % filename
  fp = open(filename, 'r')
  parser = QueryParser()
  parser.feed(fp.read())
  fp.close()
  parser.close()

def usage():
  print "Options: [-title] [-desc] [-narr] [-text]topics_in_trec_format_file"
  print "Translate the query file from origional TREC format to the format which is acceptable by ParseToFile in Lemur"
  # print "And output each query in separate file in the format of my trecMiner.exe"

if len(sys.argv) <= 1:
  usage()
  sys.exit()

# os.mkdir('queries')
fout = open('D:\LAB\PureOrder\ori_query.txt', 'w')
print 'create file success'
flgTitle, flgNarr, flgDesc = 0, 0, 0
if __name__ == '__main__':
  for arg in sys.argv[1:]:
  if arg == '-title': flgTitle = 1
  elif arg == '-desc': flgDesc = 1
  elif arg == '-narr': flgNarr = 1
  elif arg == '-text': flgText = 1
  for arg in sys.argv[1:]:
  if arg[0] != '-':
  process(arg)
fout.close()

刚看了两天python 请大神指点

作者: hallow1987   发布时间: 2011-05-12

上面那个东西是个xml吧...用python的xml模块看看

作者: infidel   发布时间: 2011-05-12

什么错误呢?

作者: LongBless   发布时间: 2011-05-12