您好,欢迎访问一九零五行业门户网

XML文件读写实例

这是我目前在做的一个项目中用到的XML文件读写实现,记录下来以备日后查阅,也供有需要的同学参考学习。
xml文件读写类: 
import java.io.file; import java.io.fileoutputstream; import java.io.ioexception; import java.util.arraylist; import java.util.hashmap; import java.util.iterator; import java.util.list; import java.util.map; import java.util.logging.level; import java.util.logging.logger; import org.jdom.document; import org.jdom.element; import org.jdom.jdomexception; import org.jdom.input.saxbuilder; import org.jdom.output.format; import org.jdom.output.xmloutputter; import org.lt.cj.config.entities.configmodel; import org.lt.cj.config.entities.tmallconfigmodel; import org.lt.cj.core.seed; public class xmlconfigwriter { /*创建淘宝商城的配置文件*/ public document buildupmalldocument(tmallconfigmodel missionconfig) throws missionconfigexception, enterurlsexception { if (missionconfig == null) { throw new missionconfigexception(); } else if (missionconfig.getseeds().isempty()) { return null; } // create the root element element rootelement = new element("website"); /* 设置网站属性 */ /* 设置网站名称 */ rootelement.setattribute("name", missionconfig.getwebsitename()); /*设置网站地址*/ rootelement.setattribute("url", missionconfig.getwebsiteurl()); //添加任务名称 element taskelement = new element("taskname"); taskelement.addcontent(missionconfig.gettaskname()); rootelement.addcontent(taskelement); //构造种子列表节点 element seeds = new element("seeds"); for (int i = 0; i < missionconfig.getseeds().size(); i++) { element seedelement = new element("seed"); element seednameelement = new element("seedname"); seednameelement.addcontent(missionconfig.getseeds().get(i).getseedname()); element seedurlelement = new element("seedurl"); seedurlelement.addcontent(missionconfig.getseeds().get(i).geturl()); element seedsortnameelement = new element("sortname"); seedsortnameelement.addcontent(missionconfig.getseeds().get(i).getsortname()); seedelement.addcontent(seedsortnameelement); seedelement.addcontent(seednameelement); seedelement.addcontent(seedurlelement); seeds.addcontent(seedelement); } rootelement.addcontent(seeds); 
//定义匹配的要采集的url链接fiturl的节点 element fiturls = new element("fiturls"); for (int i = 0; i < missionconfig.getfiturlregs().size(); i++) { element fiturl = new element("fit_url"); fiturl.addcontent(missionconfig.getfiturlregs().get(i)); fiturls.addcontent(fiturl); } rootelement.addcontent(fiturls);//添加到根节点 //并发工作线程数 element workingthreadselement = new element("workingthreads"); workingthreadselement.addcontent("" + missionconfig.getworkingthreads()); rootelement.addcontent(workingthreadselement);//添加到根节点 //定义页面编码节点 element pageencodingelement = new element("pageencoding"); pageencodingelement.addcontent(missionconfig.getpageencoding()); rootelement.addcontent(pageencodingelement);//添加到根节点 //定义下载图片控制标志节点 element dwdphoflagelement = new element("dwdphoflag"); dwdphoflagelement.addcontent(missionconfig.getdwdphoflag()); rootelement.addcontent(dwdphoflagelement); //定义原语言节点 element orilan = new element("orien_lan"); orilan.addcontent(missionconfig.getoriglanguage()); element translan = new element("trans_lan"); translan.addcontent(missionconfig.gettranlanguage()); rootelement.addcontent(orilan);//添加到根节点 rootelement.addcontent(translan);//添加到根节点 //定义匹配抓取信息的产品页面url节点 element pageurlregs = new element("pageurlregs"); for (int i = 0; i < missionconfig.getpagereg().size(); i++) { element pageurl = new element("pageurl"); pageurl.addcontent(missionconfig.getfiturlregs().get(i)); pageurlregs.addcontent(pageurl); } rootelement.addcontent(pageurlregs);//添加到根节点 map<string, list<string>> map = missionconfig.getentityreg(); list<string> list = null; element pathelements = new element("pathelements"); //直接循环算啦 //===================================== iterator iter = map.entryset().iterator(); while (iter.hasnext()) { map.entry e = (map.entry) iter.next(); element element = new element(e.getkey() + ""); map = missionconfig.getentityreg(); list = map.get(e.getkey() + ""); for (int i = 0; i < list.size(); i++) { element path = new element("path"); path.addcontent(list.get(i)); 
element.addcontent(path); } pathelements.addcontent(element); } rootelement.addcontent(pathelements); /* ===================================================== */ document mydocument = new document(rootelement); return mydocument; } /* 创建文档文件 */ public void createconfigfile(document document, string filepath) { try { /* 定义xml输出器 */ xmloutputter xmloutputter = new xmloutputter(); xmloutputter.setformat(format.getprettyformat()); file file = new file(filepath); if (!file.exists()) { if (file.createnewfile()) { fileoutputstream fileoutputstream = new fileoutputstream(filepath); xmloutputter.output(document, fileoutputstream); return; } } fileoutputstream fileoutputstream = new fileoutputstream(filepath); xmloutputter.output(document, fileoutputstream); } catch (java.io.ioexception e) { e.printstacktrace(); } } /* 重写文件 */ public void savetask(string filepath, configmodel configmodel) { try { tmallconfigmodel tmallconfigmodel = (tmallconfigmodel) configmodel; document document = buildupmalldocument(tmallconfigmodel); if (document != null) { createconfigfile(document, filepath); } } catch (missionconfigexception ex) { logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex); } catch (enterurlsexception ex) { logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex); } } //* xml文件读取方法 */ public tmallconfigmodel readmalldocument(string filepath) { tmallconfigmodel model = new tmallconfigmodel(); saxbuilder sb = new saxbuilder(); try { //读取基本配置信息 document doc = sb.build(filepath); //构造文档对象 element root = doc.getrootelement(); //获取根元素 string websitename = root.getattributevalue("name"); //获取网站名称 string websiteaddr = root.getattributevalue("url"); //获取网站地址 model.setwebsitename(websitename); //设置网站名称 model.setwebsiteurl(websiteaddr); //设置网站地址 element tasknameelement = root.getchild("taskname"); //获取任务名内容 string taskname = tasknameelement.gettext(); model.settaskname(taskname); //获取入口种子列表 list<seed> seedlist = new arraylist(); element 
seedselement = root.getchild("seeds"); list list = seedselement.getchildren(); for (int i = 0; i < list.size(); i++) { element element = (element) seedselement.getchildren().get(i); seed seed = new seed(); element seednameelement = element.getchild("seedname"); element seedurlelement = element.getchild("seedurl"); element seedsortnameelement = element.getchild("sortname"); seed.setseedname(seednameelement.gettexttrim()); seed.seturl(seedurlelement.gettexttrim()); seed.setsortname(seedsortnameelement.gettexttrim()); element parentseedelement = element.getchild("parentseed"); if (parentseedelement != null) { seed parentseed = new seed(); element parentseednameelement = parentseedelement.getchild("seedname"); element parentseedurlelement = parentseedelement.getchild("seedurl"); element parentseedsortnameelement = parentseedelement.getchild("sortname"); parentseed.setseedname(parentseednameelement.gettext()); parentseed.seturl(parentseedurlelement.gettexttrim()); parentseed.setsortname(parentseedsortnameelement.gettexttrim()); } seedlist.add(seed); } model.setseeds(seedlist); //获取匹配的要抽取的页面的特定部分内容 list = new arraylist(); element extracthtmlelement = root.getchild("extracthtml"); if (extracthtmlelement != null) { for (int i = 0; i < extracthtmlelement.getchildren().size(); i++) { element element = (element) extracthtmlelement.getchildren().get(i); list.add(element.gettext()); } } model.setextracthtmlreg(list); //获取匹配urls list = new arraylist(); element fiturlselement = root.getchild("fiturls"); for (int i = 0; i < fiturlselement.getchildren().size(); i++) { element element = (element) fiturlselement.getchildren().get(i); list.add(element.gettext()); } model.setfiturlregs(list); //获取线程数量 element workingthreadselement = root.getchild("workingthreads"); string workingcount = workingthreadselement.gettext(); model.setworkingthreads(integer.valueof(workingcount)); //获取解析编码 element pageencodingelement = root.getchild("pageencoding"); string pageencoding = 
pageencodingelement.gettext(); model.setpageencoding(pageencoding); //获取是否下载图片的标志 element dwdphoflagelement = root.getchild("dwdphoflag"); string dphoflag = dwdphoflagelement.gettext(); model.setdwdphoflag(dphoflag); //获取语言 element orien_lanelement = root.getchild("orien_lan"); string orien = orien_lanelement.gettext(); model.setoriglanguage(orien); element trans_lanelement = root.getchild("trans_lan"); string trans_lan = trans_lanelement.gettext(); model.settranlanguage(trans_lan); //获取url正则匹配 element pageurlregselement = root.getchild("pageurlregs"); list = new arraylist(); for (int i = 0; i < pageurlregselement.getchildren().size(); i++) { element element = (element) pageurlregselement.getchildren().get(i); list.add(element.gettext()); } model.setpagereg(list); //获取余下的匹配规则 map<string, list<string>> entityreg = new hashmap(); element pathelements = root.getchild("pathelements"); for (int i = 0; i < pathelements.getchildren().size(); i++) { element element = (element) pathelements.getchildren().get(i); list<string> pathlist = new arraylist(); string mapname = element.getname(); for (int j = 0; j < element.getchildren().size(); j++) { element childelement = (element) element.getchildren().get(j); pathlist.add(childelement.gettext()); } entityreg.put(mapname, pathlist); } model.setentityreg(entityreg); } catch (jdomexception ex) { logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex); } catch (ioexception ex) { logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex); } return model; } }
xml文件内容:
<?xml version="1.0" encoding="utf-8"?> <website name="taobao_mall" url="http://www.tmall.com/?ver=2011b"> <taskname>caiji_tmall_精品男装_t恤</taskname> <seeds> <seed> <sortname>精品男装/t恤</sortname> <seedname>精品男装/t恤</seedname> <seedurl>http://item.tmall.com/item.htm?id=9351702393</seedurl> </seed> </seeds> <extracthtml> <path>div class="list item-view item-miniview"</path> </extracthtml> <fiturls> <fit_url>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</fit_url> <fit_url>http://list\.tmall\.com/.*</fit_url> <fit_url>http://item\.tmall\.com/item\.htm.*</fit_url> </fiturls> <workingthreads>1</workingthreads> <pageencoding>utf-8</pageencoding> <orien_lan>zh</orien_lan> <trans_lan>en</trans_lan> <pageurlregs> <pageurl>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</pageurl> </pageurlregs> <pathelements> <commnents> <path>div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="j_detail"</path> <path>div id="reviews" class="j_detailsection" data-reviewapi</path> </commnents> <shopaddr> <path>div class="clearfix tb-header-nav"</path> <path>div class="nav"</path> <path>a href</path> </shopaddr> <productdetail> <path>div id="attributes" class="attributes</path> <path>ul class="attributes-list</path> <path>li</path> </productdetail> <photospath> <path>div class="tb-detail-bd tb-clear"</path> <path>div class="tb-gallery"</path> <path>div class="tb-booth tb-pic tb-s310"</path> <path>img id="j_imgbooth" src</path> </photospath> <category> <path>ul class="mallcrumbs-nav" id="j_crumbs"</path> <path>li class="mallcrumbs-nav-item"</path> </category> <countsold> <path>div class="tb-detail-bd tb-clear"</path> <path>ul class="tb-meta"</path> <path>li class="tb-sold-out tb-clear"</path> </countsold> <shopinfo> <path>div class="shop-intro"</path> <path>div class="extend"</path> <path>li</path> </shopinfo> <despphos> <path>script</path> </despphos> <thumbphospath> <path>div class="tb-detail-bd tb-clear"</path> <path>div class="tb-gallery"</path> <path>ul id="j_ulthumb" 
class="tb-thumb tb-clearfix"</path> <path>img src=</path> </thumbphospath> <productname> <path>div class="layout grid-s5m0 "</path> <path>div class="tb-detail-hd"</path> <path>a target="_blank" href=</path> </productname> <productprice> <path>div class="tb-detail-bd tb-clear"</path> <path>ul class="tb-meta"</path> <path>li id="j_strpricemodbox" class="tb-detail-price tb-clearfix"</path> </productprice> </pathelements> </website>
其它类似信息

推荐信息