-
-
Notifications
You must be signed in to change notification settings - Fork 41
Cloud/K8s Support and Multi-Index RAG Expansion #133
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3c61311
47a4e4d
b878e75
3a33d15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| { | ||
| "exclude_paths": [ | ||
| "banners/**", | ||
| "docs/**", | ||
| "modules/rootkit/**", | ||
| "parquets/**", | ||
| "persistent_chroma_db/**", | ||
| "sessions/**", | ||
| "static/**", | ||
| "test/**", | ||
| "external/**", | ||
| "**/*.png", | ||
| "**/*.ico", | ||
| "**/*.ttf", | ||
| "**/*.so", | ||
| "**/*.o", | ||
| "**/*.pyc", | ||
| "**/*.db", | ||
| "**/*.bin", | ||
| "**/*.exe", | ||
| "**/*.elf", | ||
| "**/*.macho", | ||
| "**/*.gz" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| banners/ | ||
| docs/ | ||
| modules/rootkit/ | ||
| parquets/ | ||
| persistent_chroma_db/ | ||
| sessions/ | ||
| static/ | ||
| test/ | ||
| external/ | ||
| *.png | ||
| *.ico | ||
| *.ttf | ||
| *.so | ||
| *.o | ||
| *.pyc | ||
| *.db | ||
| *.bin | ||
| *.exe | ||
| *.elf | ||
| *.macho | ||
| *.gz |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| { | ||
| "env": { | ||
| "browser": true, | ||
| "es2021": true, | ||
| "node": true | ||
| }, | ||
| "extends": "eslint:recommended", | ||
| "parserOptions": { | ||
| "ecmaVersion": 12, | ||
| "sourceType": "module" | ||
| }, | ||
| "rules": {}, | ||
| "ignorePatterns": [ | ||
| "banners/", | ||
| "docs/", | ||
| "modules/rootkit/", | ||
| "parquets/", | ||
| "persistent_chroma_db/", | ||
| "sessions/", | ||
| "static/", | ||
| "*.png", | ||
| "*.ico", | ||
| "*.ttf", | ||
| "*.so", | ||
| "*.o" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| [MASTER] | ||
| ignore=banners,docs,modules/rootkit,parquets,persistent_chroma_db,sessions,static,test,external | ||
| ignore-patterns=.*\.png,.*\.ico,.*\.ttf,.*\.so,.*\.o,.*\.pyc,.*\.db,.*\.bin,.*\.exe,.*\.elf,.*\.macho,.*\.gz |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -198,6 +198,26 @@ def update(self, reward: float, detection_prob: float) -> None: | |
| "Gemma 2 9B for output analysis, log parsing, and report synthesis." | ||
| ), | ||
| ), | ||
| ExpertProfile( | ||
| expert_id="groq_cloud", | ||
| backend="groq", | ||
| model="llama-3.3-70b-versatile", | ||
| capabilities=["cloud_enum", "cloud_exploit", "iam_analysis"], | ||
| base_weight=0.75, | ||
| cost_tier=2, | ||
| latency_ms=2000, | ||
| description="Specialized expert for AWS/Azure/GCP enumeration and IAM exploitation.", | ||
| ), | ||
| ExpertProfile( | ||
| expert_id="groq_container", | ||
| backend="groq", | ||
| model="llama-3.3-70b-versatile", | ||
| capabilities=["container_escape", "k8s_enum", "docker_audit"], | ||
| base_weight=0.75, | ||
| cost_tier=2, | ||
| latency_ms=2000, | ||
| description="Specialized expert for Kubernetes, Docker, and container escape techniques.", | ||
| ), | ||
|
Comment on lines
+201
to
+220
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [CRITICAL_BUG] These experts expose capabilities like 'cloud_exploit' and 'iam_analysis' which imply handling sensitive cloud metadata and potentially credentials. Ensure there are safeguards preventing leakage of secrets to externally hosted models (audit logs, redaction, and an explicit policy gating such tasks to trusted/local backends). Add a capability-to-policy mapping in code that requires human approval or uses offline models for dangerous operations.
# modules/moe_router.py (conceptual example – policy wiring likely lives elsewhere)
SENSITIVE_CAPABILITIES = {
"cloud_exploit": "offline_only",
"iam_analysis": "offline_preferred",
"container_escape": "offline_preferred",
}
# When selecting experts, enforce a policy check, e.g. in the router’s selection logic:
if any(cap in SENSITIVE_CAPABILITIES for cap in requested_caps):
# filter to trusted/local backends
candidates = [
e for e in candidates
if e.backend in ("ollama", "local_llm")
] |
||
| ] | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,6 +56,8 @@ class FindingType(str, Enum): | |
| DOMAIN = "domain" | ||
| EMAIL = "email" | ||
| ERROR = "error" | ||
| CLOUD_ROLE = "cloud_role" | ||
| K8S_RESOURCE = "k8s_resource" | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -315,6 +317,38 @@ def extract(self, text: str, host: str) -> List[Finding]: | |
| return results | ||
|
|
||
|
|
||
| class _CloudIdentityExtractor(Extractor): | ||
| """Extracts IAM roles, ARNs, and K8s resources from cloud tool output.""" | ||
| _PATTERNS = [ | ||
| # AWS ARN | ||
| re.compile(r'arn:aws:iam::\d{12}:[a-zA-Z0-9:/._-]+'), | ||
| # Azure Resource ID | ||
| re.compile(r'/subscriptions/[a-f0-9-]{36}/resourceGroups/[a-zA-Z0-9._-]+'), | ||
| # K8s resources | ||
| re.compile(r'\b(pod|deployment|service|namespace|secret)/[a-z0-9-]{1,63}\b'), | ||
| ] | ||
|
Comment on lines
+323
to
+329
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [VALIDATION] Azure resource regex is too permissive and doesn't anchor GUID structure. Current pattern '/subscriptions/[a-f0-9-]{36}/resourceGroups/...' will match noise and is case-sensitive. Use a more precise regex (with case-insensitive flag) to match GUID format: '/subscriptions/[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/resourceGroups/[^/]+'. Also compile the pattern with re.IGNORECASE to accept uppercase hex in GUIDs.
class _CloudIdentityExtractor(Extractor):
"""Extracts IAM roles, ARNs, and K8s resources from cloud tool output."""
_PATTERNS = [
# AWS ARN
re.compile(r"arn:aws:iam::\d{12}:[a-zA-Z0-9:/._-]+"),
# Azure Resource ID (GUID-anchored, case-insensitive)
re.compile(
r"/subscriptions/[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/resourceGroups/[a-zA-Z0-9._-]+",
re.IGNORECASE,
),
# K8s resources
re.compile(r"\b(pod|deployment|service|namespace|secret)/[a-z0-9-]{1,63}\b"),
] |
||
|
|
||
| def extract(self, text: str, host: str) -> List[Finding]: | ||
| seen: set = set() | ||
| results: List[Finding] = [] | ||
| for pat in self._PATTERNS: | ||
| for m in pat.finditer(text): | ||
| val = m.group() | ||
| if val not in seen: | ||
| seen.add(val) | ||
| ftype = FindingType.CLOUD_ROLE | ||
| if "arn:aws" in val or "/subscriptions/" in val: | ||
| ftype = FindingType.CLOUD_ROLE | ||
| else: | ||
| ftype = FindingType.K8S_RESOURCE | ||
|
|
||
| results.append(Finding( | ||
| ftype, val, | ||
| host=host, confidence=0.95, raw=m.group() | ||
| )) | ||
| return results | ||
|
|
||
|
|
||
|
Comment on lines
+320
to
+351
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [VALIDATION] Extraction logic conflates different cloud artifact types and can mislabel Azure resource IDs as CLOUD_ROLE. Instead of defaulting ftype to CLOUD_ROLE and using a simple substring check, explicitly detect and classify: - AWS IAM ARNs (role/user/policy) -> separate FindingType (e.g. AWS_ARN or CLOUD_ROLE with subtype), - Azure resource IDs -> a distinct FindingType (e.g. AZURE_RESOURCE), - K8s matches -> K8S_RESOURCE. Prefer using regex capture groups to determine the exact kind (role vs service vs resource) and populate a subtype or metadata field rather than overloading CLOUD_ROLE.
class _CloudIdentityExtractor(Extractor):
"""Extracts IAM roles, ARNs, Azure resources, and K8s resources from cloud tool output."""
_PATTERNS = [
# AWS IAM ARNs: role / user / policy / assumed-role
re.compile(r"arn:aws:iam::(?P<account>\d{12}):(?P<kind>role|user|policy|assumed-role)/(?P<name>[A-Za-z0-9+=,.@_\-/]+)"),
# Azure Resource ID (GUID-anchored, case-insensitive)
re.compile(
r"/subscriptions/(?P<sub>[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})/resourceGroups/(?P<rg>[^/]+)(?P<rest>/providers/[^\s'\"]+)?",
re.IGNORECASE,
),
# K8s resources
re.compile(r"\b(?P<kind>pod|deployment|service|namespace|secret)/(?P<name>[a-z0-9-]{1,63})\b"),
]
def extract(self, text: str, host: str) -> List[Finding]:
seen: set = set()
results: List[Finding] = []
for pat in self._PATTERNS:
for m in pat.finditer(text):
raw = m.group(0)
if raw in seen:
continue
seen.add(raw)
if raw.startswith("arn:aws:iam::"):
ftype = FindingType.CLOUD_ROLE
elif raw.lower().startswith("/subscriptions/"):
ftype = FindingType.CLOUD_ROLE # or a new AZURE_RESOURCE type if added
else:
ftype = FindingType.K8S_RESOURCE
results.append(Finding(
type=ftype,
value=raw,
host=host,
confidence=0.95,
raw=raw,
))
return results |
||
| # --------------------------------------------------------------------------- | ||
| # Success heuristic | ||
| # --------------------------------------------------------------------------- | ||
|
|
@@ -372,6 +406,7 @@ def __init__(self) -> None: | |
| _UsernameExtractor(), | ||
| _DomainExtractor(), | ||
| _ErrorExtractor(), | ||
| _CloudIdentityExtractor(), | ||
| ]: | ||
| self._registry.register(ext) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[CRITICAL_BUG] Confirm the model name and backend pairing are valid for your deployment: 'llama-3.3-70b-versatile' + backend='groq' may not be available or may require different identifiers / credentials. Add a startup-time validation (or fallback) that checks the model is reachable and the backend supports it to avoid runtime failures. For consistency follow the pattern used for ollama_reason where the model is read from env (see lines ~177-186) — prefer environment configuration for large/externally-hosted models so deployments can opt-out or swap models without code changes.