这是目前在做的一个项目中用到的xml文件读写实现，记下来以备日后查阅，也供有需要的同学参考学习。
xml文件读写类:
import java.io.file;
import java.io.fileoutputstream;
import java.io.ioexception;
import java.util.arraylist;
import java.util.hashmap;
import java.util.iterator;
import java.util.list;
import java.util.map;
import java.util.logging.level;
import java.util.logging.logger;
import org.jdom.document;
import org.jdom.element;
import org.jdom.jdomexception;
import org.jdom.input.saxbuilder;
import org.jdom.output.format;
import org.jdom.output.xmloutputter;
import org.lt.cj.config.entities.configmodel;
import org.lt.cj.config.entities.tmallconfigmodel;
import org.lt.cj.core.seed;
/**
 * Writes a tmall crawl-task configuration out as an XML file and reads it back.
 *
 * The document produced by buildupmalldocument has a "website" root (attributes
 * "name" and "url") whose children are, in order: taskname, seeds, fiturls,
 * workingthreads, pageencoding, dwdphoflag, orien_lan, trans_lan, pageurlregs
 * and pathelements.
 *
 * NOTE(review): every identifier in this listing is lower-cased, including JDK
 * and JDOM names. The casing is kept as found so that the class stays consistent
 * with the import lines above it.
 */
public class xmlconfigwriter {

    /**
     * Builds the XML document for a tmall (Taobao mall) task configuration.
     *
     * @param missionconfig the task configuration to serialise
     * @return the document, or null when the configuration has no seeds
     * @throws missionconfigexception if missionconfig is null
     * @throws enterurlsexception declared for callers; never thrown by this implementation
     */
    public document buildupmalldocument(tmallconfigmodel missionconfig) throws missionconfigexception, enterurlsexception {
        if (missionconfig == null) {
            throw new missionconfigexception();
        }
        if (missionconfig.getseeds().isempty()) {
            // Without seeds there is nothing to write; savetask skips the file on null.
            return null;
        }

        // Root element carries the web site name and address as attributes.
        element rootelement = new element("website");
        rootelement.setattribute("name", missionconfig.getwebsitename());
        rootelement.setattribute("url", missionconfig.getwebsiteurl());

        rootelement.addcontent(textelement("taskname", missionconfig.gettaskname()));

        // Seed list: each seed holds sortname, seedname and seedurl, in that order.
        element seeds = new element("seeds");
        for (seed current : missionconfig.getseeds()) {
            element seedelement = new element("seed");
            seedelement.addcontent(textelement("sortname", current.getsortname()));
            seedelement.addcontent(textelement("seedname", current.getseedname()));
            seedelement.addcontent(textelement("seedurl", current.geturl()));
            seeds.addcontent(seedelement);
        }
        rootelement.addcontent(seeds);

        // Patterns of the links that should be followed.
        rootelement.addcontent(listelement("fiturls", "fit_url", missionconfig.getfiturlregs()));

        // Number of concurrent worker threads, page encoding, image-download flag.
        rootelement.addcontent(textelement("workingthreads", "" + missionconfig.getworkingthreads()));
        rootelement.addcontent(textelement("pageencoding", missionconfig.getpageencoding()));
        rootelement.addcontent(textelement("dwdphoflag", missionconfig.getdwdphoflag()));

        // Original language and translation target language.
        rootelement.addcontent(textelement("orien_lan", missionconfig.getoriglanguage()));
        rootelement.addcontent(textelement("trans_lan", missionconfig.gettranlanguage()));

        // Patterns of the product pages to extract from. The previous version sized
        // this loop by getpagereg() but read the values from getfiturlregs(), which
        // wrote the wrong patterns and could run past the end of that list.
        rootelement.addcontent(listelement("pageurlregs", "pageurl", missionconfig.getpagereg()));

        // One child per entity name, each holding its ordered list of "path" entries.
        element pathelements = new element("pathelements");
        for (map.entry<string, list<string>> entry : missionconfig.getentityreg().entryset()) {
            pathelements.addcontent(listelement(entry.getkey(), "path", entry.getvalue()));
        }
        rootelement.addcontent(pathelements);

        return new document(rootelement);
    }

    /**
     * Writes the document to filepath using the pretty-print format, creating the
     * file if needed and replacing any previous content.
     *
     * I/O failures are logged at severe level and not rethrown.
     *
     * @param document the document to write
     * @param filepath the destination file path
     */
    public void createconfigfile(document document, string filepath) {
        try {
            xmloutputter xmloutputter = new xmloutputter();
            xmloutputter.setformat(format.getprettyformat());
            // Opening the stream creates a missing file, so no separate
            // exists()/createnewfile() step is needed.
            fileoutputstream fileoutputstream = new fileoutputstream(filepath);
            try {
                xmloutputter.output(document, fileoutputstream);
            } finally {
                // Always release the file handle, including when output() fails.
                fileoutputstream.close();
            }
        } catch (java.io.ioexception e) {
            logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, e);
        }
    }

    /**
     * Rebuilds the configuration document from configmodel and rewrites the file.
     *
     * Nothing is written when buildupmalldocument returns null. Exceptions raised
     * while building the document are logged at severe level and not rethrown.
     *
     * @param filepath    the destination file path
     * @param configmodel the configuration; it is cast to tmallconfigmodel
     */
    public void savetask(string filepath, configmodel configmodel) {
        try {
            tmallconfigmodel tmallconfigmodel = (tmallconfigmodel) configmodel;
            document document = buildupmalldocument(tmallconfigmodel);
            if (document != null) {
                createconfigfile(document, filepath);
            }
        } catch (missionconfigexception ex) {
            logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex);
        } catch (enterurlsexception ex) {
            logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex);
        }
    }

    /**
     * Reads a configuration file back into a tmallconfigmodel.
     *
     * Optional or missing elements no longer abort the read: a missing scalar
     * element leaves the corresponding model property untouched, and a missing
     * list container yields an empty list (the behaviour "extracthtml" already
     * had). Parse and I/O failures are logged at severe level; the model is
     * returned with whatever had been populated up to that point.
     *
     * @param filepath location of the XML configuration
     * @return the populated model, never null
     */
    public tmallconfigmodel readmalldocument(string filepath) {
        tmallconfigmodel model = new tmallconfigmodel();
        saxbuilder sb = new saxbuilder();
        try {
            // NOTE(review): the path is handed to the builder as a plain string, which
            // JDOM documents as a system id / URI; confirm that paths containing spaces
            // or backslashes resolve as intended, or switch to the file overload.
            document doc = sb.build(filepath);
            element root = doc.getrootelement();

            // Basic site information lives in the root attributes.
            model.setwebsitename(root.getattributevalue("name"));
            model.setwebsiteurl(root.getattributevalue("url"));

            string taskname = childtext(root, "taskname", false);
            if (taskname != null) {
                model.settaskname(taskname);
            }

            // Entry seeds.
            list<seed> seedlist = new arraylist<seed>();
            element seedselement = root.getchild("seeds");
            if (seedselement != null) {
                list seedelements = seedselement.getchildren();
                for (int i = 0; i < seedelements.size(); i++) {
                    seedlist.add(readseed((element) seedelements.get(i)));
                }
            }
            model.setseeds(seedlist);

            // Page fragments to extract, then the link patterns to follow.
            model.setextracthtmlreg(childtexts(root.getchild("extracthtml")));
            model.setfiturlregs(childtexts(root.getchild("fiturls")));

            // Worker thread count; surrounding whitespace is ignored. A non-numeric
            // value still fails in integer.valueof, as it did before.
            string workingcount = childtext(root, "workingthreads", true);
            if (workingcount != null) {
                model.setworkingthreads(integer.valueof(workingcount));
            }

            string pageencoding = childtext(root, "pageencoding", false);
            if (pageencoding != null) {
                model.setpageencoding(pageencoding);
            }

            // Image-download flag; absent in some existing configuration files.
            string dphoflag = childtext(root, "dwdphoflag", false);
            if (dphoflag != null) {
                model.setdwdphoflag(dphoflag);
            }

            // Source and target languages.
            string orien = childtext(root, "orien_lan", false);
            if (orien != null) {
                model.setoriglanguage(orien);
            }
            string trans_lan = childtext(root, "trans_lan", false);
            if (trans_lan != null) {
                model.settranlanguage(trans_lan);
            }

            // Product page URL patterns.
            model.setpagereg(childtexts(root.getchild("pageurlregs")));

            // Remaining extraction rules: element name -> ordered list of path texts.
            map<string, list<string>> entityreg = new hashmap<string, list<string>>();
            element pathelements = root.getchild("pathelements");
            if (pathelements != null) {
                list groups = pathelements.getchildren();
                for (int i = 0; i < groups.size(); i++) {
                    element group = (element) groups.get(i);
                    entityreg.put(group.getname(), childtexts(group));
                }
            }
            model.setentityreg(entityreg);
        } catch (jdomexception ex) {
            logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex);
        } catch (ioexception ex) {
            logger.getlogger(xmlconfigwriter.class.getname()).log(level.severe, null, ex);
        }
        return model;
    }

    /** Creates an element called name whose only content is the given text. */
    private static element textelement(string name, string text) {
        element result = new element(name);
        result.addcontent(text);
        return result;
    }

    /** Creates a container element holding one itemname child per value, in list order. */
    private static element listelement(string containername, string itemname, list<string> values) {
        element container = new element(containername);
        for (string value : values) {
            container.addcontent(textelement(itemname, value));
        }
        return container;
    }

    /** Returns the (optionally trimmed) text of the named child, or null when the child is absent. */
    private static string childtext(element parent, string childname, boolean trim) {
        element child = parent.getchild(childname);
        if (child == null) {
            return null;
        }
        return trim ? child.gettexttrim() : child.gettext();
    }

    /** Returns the text of every child of container in document order; empty when container is null. */
    private static list<string> childtexts(element container) {
        list<string> texts = new arraylist<string>();
        if (container != null) {
            list children = container.getchildren();
            for (int i = 0; i < children.size(); i++) {
                texts.add(((element) children.get(i)).gettext());
            }
        }
        return texts;
    }

    /** Builds a seed from the trimmed seedname, seedurl and sortname children that are present. */
    private static seed readseed(element seedelement) {
        seed result = new seed();
        string name = childtext(seedelement, "seedname", true);
        if (name != null) {
            result.setseedname(name);
        }
        string url = childtext(seedelement, "seedurl", true);
        if (url != null) {
            result.seturl(url);
        }
        string sortname = childtext(seedelement, "sortname", true);
        if (sortname != null) {
            result.setsortname(sortname);
        }
        // NOTE(review): a "parentseed" child used to be parsed here into a seed that
        // was never attached to this seed or to the model, so it had no effect. No
        // attachment API is visible in this file; restore the parsing together with
        // the proper setter if parent seeds are meant to be kept. TODO confirm.
        return result;
    }
}
xml文件内容:
<?xml version="1.0" encoding="utf-8"?>
<website name="taobao_mall" url="http://www.tmall.com/?ver=2011b">
<taskname>caiji_tmall_精品男装_t恤</taskname>
<seeds>
<seed>
<sortname>精品男装/t恤</sortname>
<seedname>精品男装/t恤</seedname>
<seedurl>http://item.tmall.com/item.htm?id=9351702393</seedurl>
</seed>
</seeds>
<extracthtml>
<path>div class="list item-view item-miniview"</path>
</extracthtml>
<fiturls>
<fit_url>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</fit_url>
<fit_url>http://list\.tmall\.com/.*</fit_url>
<fit_url>http://item\.tmall\.com/item\.htm.*</fit_url>
</fiturls>
<workingthreads>1</workingthreads>
<pageencoding>utf-8</pageencoding>
<orien_lan>zh</orien_lan>
<trans_lan>en</trans_lan>
<pageurlregs>
<pageurl>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</pageurl>
</pageurlregs>
<pathelements>
<commnents>
<path>div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="j_detail"</path>
<path>div id="reviews" class="j_detailsection" data-reviewapi</path>
</commnents>
<shopaddr>
<path>div class="clearfix tb-header-nav"</path>
<path>div class="nav"</path>
<path>a href</path>
</shopaddr>
<productdetail>
<path>div id="attributes" class="attributes</path>
<path>ul class="attributes-list</path>
<path>li</path>
</productdetail>
<photospath>
<path>div class="tb-detail-bd tb-clear"</path>
<path>div class="tb-gallery"</path>
<path>div class="tb-booth tb-pic tb-s310"</path>
<path>img id="j_imgbooth" src</path>
</photospath>
<category>
<path>ul class="mallcrumbs-nav" id="j_crumbs"</path>
<path>li class="mallcrumbs-nav-item"</path>
</category>
<countsold>
<path>div class="tb-detail-bd tb-clear"</path>
<path>ul class="tb-meta"</path>
<path>li class="tb-sold-out tb-clear"</path>
</countsold>
<shopinfo>
<path>div class="shop-intro"</path>
<path>div class="extend"</path>
<path>li</path>
</shopinfo>
<despphos>
<path>script</path>
</despphos>
<thumbphospath>
<path>div class="tb-detail-bd tb-clear"</path>
<path>div class="tb-gallery"</path>
<path>ul id="j_ulthumb" class="tb-thumb tb-clearfix"</path>
<path>img src=</path>
</thumbphospath>
<productname>
<path>div class="layout grid-s5m0 "</path>
<path>div class="tb-detail-hd"</path>
<path>a target="_blank" href=</path>
</productname>
<productprice>
<path>div class="tb-detail-bd tb-clear"</path>
<path>ul class="tb-meta"</path>
<path>li id="j_strpricemodbox" class="tb-detail-price tb-clearfix"</path>
</productprice>
</pathelements>
</website>