Python解析baidu的mp3列表
- 命令行下运行wget -i _baidump3url.txt
- 运行 python _baidump3.py > baidump3.txt
- 运行del *.htm?
将获得baidu所有歌曲的名字列表
_baidump3url.txt的内容:
http://list.mp3.baidu.com/song/A.htm
http://list.mp3.baidu.com/song/B.htm
http://list.mp3.baidu.com/song/C.htm
http://list.mp3.baidu.com/song/D.htm
http://list.mp3.baidu.com/song/E.htm
http://list.mp3.baidu.com/song/F.htm
http://list.mp3.baidu.com/song/G.htm
http://list.mp3.baidu.com/song/H.htm
http://list.mp3.baidu.com/song/J.htm
http://list.mp3.baidu.com/song/K.htm
http://list.mp3.baidu.com/song/L.htm
http://list.mp3.baidu.com/song/M.htm
http://list.mp3.baidu.com/song/N.htm
http://list.mp3.baidu.com/song/O.htm
http://list.mp3.baidu.com/song/P.htm
http://list.mp3.baidu.com/song/Q.htm
http://list.mp3.baidu.com/song/R.htm
http://list.mp3.baidu.com/song/S.htm
http://list.mp3.baidu.com/song/T.htm
http://list.mp3.baidu.com/song/W.htm
http://list.mp3.baidu.com/song/X.htm
http://list.mp3.baidu.com/song/Y.htm
http://list.mp3.baidu.com/song/Z.htm
_baidump3.py:
#!/usr/bin/python import urllib import string import re def GetContent (url): try: URLFile=urllib.urlopen(url) except IOError: print "\nCan not retrieve ",url,"!\nThe connection cannot be made!\n" else: HTMLText=URLFile.read() URLFile.close() return HTMLText if(__name__=="__main__"): file=open('_baidump3url.txt','r') fileread=file.read() urls=fileread.split('\n') queue=[] # regexp=re.compile(r'" target=_blank>(.*?)</[aA]></td>') for url in urls: #print url url=re.sub('http://list.mp3.baidu.com/song','.',url) #print url content=open(url,'r').read() lines=content.split('\n') for line in lines: #print line ccc=regexp.search(line) if(ccc): word=ccc.groups()[0] if word in queue: pass else: queue.append(word) #print url file.close() regexp1=re.compile(r'[- ](.*)') for w in queue: w = unicode(w,'cp936') w = w.encode('utf8') ccc=regexp1.search(w) if(ccc): w=ccc.groups()[0] #print w w=re.sub('《|》|,|\.|·|!','',w) if(''==w): continue print w





