CMS识别原理
CMS英文全称是:Content Management System,中文名称是:网站内容管理系统。CMS识别的原理就是先收集各种CMS的固有特征,再通过匹配这些特征来判断目标网站所使用的CMS类别。
这里我们采用MD5识别和正则表达式识别两种方式,具体来说就是用特定的文件路径访问网站,然后计算该文件的MD5,或者用正则表达式在返回内容中匹配某个关键词,如果匹配成功就说明是这个CMS。所以识别的成功率与指纹字典的质量有关。
指纹格式
这里截取一些Web指纹作为参考:
{
"url": "/install/",
"re": "aspcms",
"name": "AspCMS",
"md5": ""
},
{
"url": "/about/_notes/dwsync.xml",
"re": "aspcms",
"name": "AspCMS",
"md5": ""
},
{
"url": "/admin/_Style/_notes/dwsync.xml",
"re": "aspcms",
"name": "AspCMS",
"md5": ""
},
{
"url": "/apply/_notes/dwsync.xml",
"re": "aspcms",
"name": "AspCMS",
"md5": ""
},
{
"url": "/tpl/green/common/images/notebg.jpg",
"re": "",
"name": "自动发卡平台",
"md5": "690f337298c331f217c0407cc11620e9"
},
{
"url": "/images/download.png",
"re": "",
"name": "全程oa",
"md5": "9921660baaf9e0b3b747266eb5af880f"
},
{
"url": "/kindeditor/license.txt",
"re": "",
"name": "T-Site建站系统",
"md5": "b0d181292c99cf9bb2ae9166dd3a0239"
},
{
"url": "/public/ico/favicon.png",
"re": "",
"name": "悟空CRM",
"md5": "834089ffa1cd3a27b920a335d7c067d7"
},
{
"url": "/public/js/php/file_manager_json.php",
"re": "",
"name": "悟空CRM",
"md5": "c64fd0278d72826eb9041773efa1f587"
},
{
"url": "/plugins/weathermap/images/exclamation.png",
"re": "",
"name": "CactiEZ插件",
"md5": "2e25cb083312b0eabfa378a89b07cd03"
}
指纹文件
在 data 目录下存放 JSON 格式的Web指纹文件 data.json,总共收录了1400+条国内常见指纹,[下载地址]()
代码编写
思路虽然简单,但实现起来还有很多问题,比如效率:1000+条指纹意味着需要请求1000+个网页,单线程逐个请求的话速度太慢,所以需要使用多线程。用多了也会发现多线程还是太慢,这时可以改用协程;这些留待以后慢慢优化,这里先使用多线程就行了。
新建文件 lib/core/webcms.py ,代码如下
# __author__ = 'mathor'
import json, os, sys, hashlib, threading, queue
from lib.core import Download
class webcms(object):
    """Identify the CMS behind a site by probing known fingerprint URLs.

    Fingerprints are loaded from ``data/data.json`` (resolved relative to
    ``sys.path[0]``). Each fingerprint is a dict with the keys ``url``,
    ``re``, ``name`` and ``md5``: the ``url`` path is appended to the target
    URL and fetched; when ``re`` is non-empty the response is searched for
    that keyword, otherwise the MD5 of the response is compared to ``md5``.
    """

    # Class-level defaults. __init__ rebinds the per-scan state on the
    # instance so separate scans do not share a queue or a result.
    workQueue = queue.Queue()
    URL = ""
    threadNum = 0
    # True while the search should continue; set to False once a CMS has
    # been identified or the fingerprint queue is exhausted.
    NotFound = True
    Downloader = Download.Downloader()
    result = ""

    def __init__(self, url, threadNum=10):
        """Load every fingerprint into a fresh work queue.

        Args:
            url: Base URL of the target site; fingerprint paths are appended
                to it verbatim.
            threadNum: Number of worker threads started per round in ``run``.

        Raises:
            OSError: If the fingerprint file cannot be opened.
            json.JSONDecodeError: If the fingerprint file is not valid JSON.
        """
        self.URL = url
        self.threadNum = threadNum
        # Per-instance state: the class-level queue would otherwise be
        # shared (and refilled) by every instance.
        self.workQueue = queue.Queue()
        self.NotFound = True
        self.result = ""

        filename = os.path.join(sys.path[0], 'data', 'data.json')
        # json.load() no longer accepts an ``encoding`` argument
        # (removed in Python 3.9); the encoding is set on open() instead.
        with open(filename, encoding='utf-8') as fp:
            webdata = json.load(fp)
        for fingerprint in webdata:
            self.workQueue.put(fingerprint)

    def getmd5(self, body):
        """Return the hexadecimal MD5 digest of ``body``.

        Args:
            body: Response content as ``bytes`` or ``str``. A ``str`` is
                encoded as UTF-8 first because hashlib only accepts bytes.
                NOTE(review): for binary files the digest of re-encoded text
                may differ from the digest of the raw bytes -- confirm what
                ``Downloader.get`` returns.
        """
        if isinstance(body, str):
            body = body.encode('utf-8')
        return hashlib.md5(body).hexdigest()

    def th_whatweb(self):
        """Check one fingerprint from the queue (thread worker).

        Returns:
            True when the fingerprint matched (``result`` is set and the
            search is stopped), False otherwise.
        """
        if self.NotFound is False:
            return False
        # get_nowait() avoids the race between empty() and a blocking get():
        # another worker may drain the queue in between, which would leave
        # this thread (and run()'s join) blocked forever.
        try:
            cms = self.workQueue.get_nowait()
        except queue.Empty:
            # Nothing left to try: stop the search loop in run().
            self.NotFound = False
            return False

        _url = self.URL + cms['url']
        html = self.Downloader.get(_url)
        print("[whatweb log]:checking %s" % _url)
        # A None response is treated as "nothing to compare".
        if html is None:
            return False

        if cms['re']:
            matched = html.find(cms['re']) != -1
        else:
            matched = self.getmd5(html) == cms['md5']

        if matched:
            self.result = cms['name']
            self.NotFound = False
            return True
        return False

    def run(self):
        """Run worker threads in rounds until a match or queue exhaustion.

        Prints the detected CMS name, or a not-found message.
        """
        # At least one worker per round; with zero workers the loop below
        # would never make progress.
        workers = max(1, self.threadNum)
        while self.NotFound:
            batch = [threading.Thread(target=self.th_whatweb)
                     for _ in range(workers)]
            for t in batch:
                t.start()
            for t in batch:
                t.join()
        if self.result:
            print("[webcms]:%s cms is %s" % (self.URL, self.result))
        else:
            print("[webcms]:%s cms NOTFound!" % self.URL)
调用
重写主文件 w8ay.py
#-*- coding:utf-8 -*-
'''
Name: w8ayScan
Author: mathor
Copyright (c)
'''
import sys
from lib.core.Spider import SpiderMain
from lib.core import webcms
def main():
    """Run CMS fingerprinting, then the spider, against the target site."""
    # Target base URL and the worker-thread count shared by both stages.
    target = ""
    workers = 1000

    # webcms: fingerprint the target first.
    fingerprinter = webcms.webcms(target, workers)
    fingerprinter.run()

    # spider: then crawl the same target.
    crawler = SpiderMain(target, workers)
    crawler.craw()


if __name__ == "__main__":
    main()
如果觉得《php实现指纹识别 CMS识别(Web指纹识别)扫描器开发》对你有帮助,请点赞、收藏,并留下你的观点哦!