nodejs简单实现中英文翻译

考虑到具体实现方面的问题，如果智能的话，肯定是要做中文的语法分析，不过感觉这个有难度。
所以最后的方案是遍历文件，将中文短语匹配出来，再进行人工翻译，将中文短语替换成翻译的内容。当然后期还是需要人工再检验下，毕竟代码中的中文，可能会影响到相关的程序。
这个问题，明显涉及到多线程，文件读写，第一时间就想到的是 nodejs，虽然nodejs是一个主线程，但是异步文件读写，事件响应机制，肯定也是调用了线程，在实际编程的时候不需要考虑线程的相关的问题。
代码不复杂，如下所示；写完之后做了适当的封装：
var fs = require('fs');
var http = require('http');

// Root directory that is scanned for source files.
var filepath = 'd:\\work_new\\';
// File that stores the phrase -> translation dictionary as JSON.
var logpath = 'd:\\chinese.log';
// Not referenced by the code below; kept so the script's globals stay unchanged.
var map = {};
var num = 0;

/**
 * In-memory dictionary mapping a Chinese phrase to its translation
 * (an empty string means "not translated yet").
 *
 * Members:
 *   logpath            path of the JSON file used by save2file/loadfile
 *   set(key, val)      store a translation; a missing/falsy val is stored as ''
 *   get(key)           return the stored translation, or '' when there is none
 *   save2file()        write the dictionary to logpath (throws on write error)
 *   loadfile(cb)       replace the dictionary with the content of logpath, then call cb
 *   translatebygoogle(cb)
 *                      request a translation for every phrase whose value is '',
 *                      then call cb once all requests have finished
 */
var dictionary = (function () {
    var map = {};

    return {
        logpath: 'd:\\chinese.log',

        set: function (key, val) {
            map[key] = val || '';
        },

        get: function (key) {
            return map[key] || '';
        },

        save2file: function () {
            // Put every entry on its own line so the file can be edited by hand.
            var text = JSON.stringify(map).replace(/","/g, '",\r\n"');
            fs.writeFile(this.logpath, text, {encoding: 'utf8', flag: 'w'}, function (err) {
                if (err) throw err;
            });
        },

        loadfile: function (callback) {
            fs.readFile(this.logpath, {encoding: 'utf8'}, function (err, data) {
                // Without this check JSON.parse(undefined) would hide the real I/O error.
                if (err) throw err;
                map = JSON.parse(data);
                callback();
            });
        },

        translatebygoogle: function (callback) {
            var index = 0; // number of requests still in flight

            var finish = function () {
                index--;
                if (index === 0) {
                    callback();
                }
            };

            Object.keys(map).forEach(function (key) {
                if (map[key] !== '') {
                    return;
                }
                index++;

                // Guard so one request can never be counted as finished twice.
                var settled = false;
                var settle = function () {
                    if (!settled) {
                        settled = true;
                        finish();
                    }
                };

                // The phrase must be percent-encoded: http.get rejects request
                // paths that contain raw non-ASCII characters.
                http.get("http://translate.google.cn/translate_a/t?client=t&hl=zh-cn&sl=zh-cn&tl=en&ie=utf-8&oe=utf-8&oc=2&otf=1&ssel=3&tsel=6&sc=2&q=" + encodeURIComponent(key), function (res) {
                    res.setEncoding('utf8');
                    var body = "";
                    res.on('data', function (chunk) {
                        body += chunk;
                    }).on('end', function () {
                        try {
                            // SECURITY NOTE(review): eval() executes whatever the remote
                            // server sends back over plain HTTP. The original code relies
                            // on eval, presumably because the reply is not strict JSON
                            // - TODO confirm and replace with a real parser.
                            var obj = eval('(' + body + ')');
                            map[key] = obj[0][0][0];
                        } catch (e) {
                            // Leave the phrase untranslated instead of crashing the run.
                            console.log('parse error', key, e.message);
                        }
                        settle();
                    });
                }).on('error', function (e) {
                    console.log('http error');
                    console.log("got error: " + e.message);
                    settle();
                });
            });

            // Nothing needed translating: still report completion to the caller.
            if (index === 0) {
                callback();
            }
        }
    };
})();

/**
 * Recursive directory walker (use with `new`).
 *
 * walkdir(pathstr, fileback, doneback):
 *   pathstr   directory to scan
 *   fileback  called as fileback(data, pathstr) for every .js/.html/.htm/.jsp
 *             file; data is '' when the file could not be read
 *   doneback  called once, after every directory has been listed and every
 *             matching file has been handed to fileback
 */
function file () {
    // Outstanding asynchronous operations: directory listings AND file reads.
    // Counting only file reads lets the counter reach zero while directories
    // are still being listed, and never reach it when no file matches.
    var index = 0;
    var sourcefile = /\.(js|html|htm|jsp)$/;

    var _finish = function (doneback) {
        index--;
        if (index === 0) {
            doneback();
        }
    };

    var _readfile = function (pathstr, fileback, doneback) {
        fs.readFile(pathstr, {encoding: 'utf8'}, function (err, data) {
            if (err) {
                // Best effort: report the unreadable file and carry on with empty content.
                data = "";
                console.log(err, pathstr);
            }
            fileback(data, pathstr);
            _finish(doneback);
        });
    };

    var _walkdir = function (pathstr, fileback, doneback) {
        index++;
        fs.readdir(pathstr, function (err, files) {
            if (err) {
                // Same best-effort policy as for unreadable files.
                console.log(err, pathstr);
                files = [];
            }
            files.forEach(function (name) {
                var child = pathstr + '/' + name;
                var isdir;
                try {
                    isdir = fs.statSync(child).isDirectory();
                } catch (e) {
                    // statSync throws for entries it cannot stat; skip them.
                    console.log(e, child);
                    return;
                }
                if (isdir) {
                    _walkdir(child, fileback, doneback);
                } else if (sourcefile.test(name)) {
                    index++;
                    _readfile(child, fileback, doneback);
                }
            });
            // Release this directory only after its children have been registered.
            _finish(doneback);
        });
    };

    this.walkdir = function (pathstr, fileback, doneback) {
        index = 0;
        _walkdir(pathstr, fileback, doneback);
    };
}

// Step 1: collect every run of Chinese characters and save the dictionary.
dictionary.logpath = logpath;
new file().walkdir(filepath, function (data) {
    if (!!data) {
        var match = data.match(/[\u4e00-\u9faf]+/g);
        if (!!match) {
            match.forEach(function (mat) {
                dictionary.set(mat);
            });
        }
    }
}, function () {
    console.log('获取中文 ok');
    dictionary.save2file();
});

// Step 2: translate the collected phrases with Google (enable manually).
/*
dictionary.loadfile(function () {
    dictionary.translatebygoogle(function () {
        dictionary.save2file();
    });
});
*/

// Step 3: replace the Chinese phrases in the files (enable manually).
/*
dictionary.loadfile(function () {
    new file().walkdir(filepath, function (data, pathstr) {
        fs.writeFile(pathstr, data.replace(/[\u4e00-\u9faf]+/g, function (ch) {
            return dictionary.get(ch);
        }), {encoding: 'ascii', flag: 'w'}, function (err) {
            if (err) throw err;
        });
    }, function () {
        console.log('中文替换 ok');
    });
});
*/
问题还是有的
1.nodejs编码问题：在 Windows 环境下对 GBK 编码支持不好，基本只能处理 UTF-8 编码的文件
2.效率上面可能可以再通过线程进行优化，这块没做深入的考虑
3.匹配出来，可能有单个的标点符号的短语等情况，需要人工排查
实际情况中，有些文件是 GBK 的，还有些文件是 UTF-8 的，所以后来还是改用脚本语言“快手”来实现，过程中有两点体会：
1.文件编码的问题，通过搜索找到的判断方法是：
判断文件开头 3 个字节是不是 EF BB BF，但是这个方法只适用于带 BOM 的 UTF-8 格式；
对无bom的utf8格式，需要进行字节特征码的判断（有难度，精力有限，使用了上面的方案，对于无bom的情况，进行人工排查）。
2.因为用快手做多线程编程很方便，所以一直以为多线程肯定比单线程效率高。实际情况却和想的不一样，单线程比多线程快多了。看来主要瓶颈还是在文件读写的 IO 上面。
以上所述就是本文全部内容了，希望大家能够喜欢。
【相关教程推荐】
1. javascript视频教程
2. javascript在线手册
3. bootstrap教程

nodejs简单实现中英文翻译

推荐信息