Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python semantic generator #596

Open
wants to merge 26 commits into
base: dev
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
eeb49c9
removed html parser
May 1, 2023
b7b7647
renamed SemanticGenerator to JavaSemanticGenerator
May 2, 2023
8cc8b3c
resolved conflict
May 2, 2023
9880152
added python semantic support
May 2, 2023
d09a4f6
commented html parser
May 2, 2023
856ef56
commented html parser
May 2, 2023
ab0becf
Merge branch 'removed_html_tagger' into python_semantic_generator
hiteshbedre May 2, 2023
ed94b8f
added html parser
May 2, 2023
5a1171a
Merge branch 'dev' into python_semantic_generator
hiteshbedre May 3, 2023
8dc09ef
added n-> -1 semantic
May 4, 2023
5b1800c
Merge branch 'dev' of https://github.com/Privado-Inc/privado-core int…
khemrajrathore May 4, 2023
432b23d
python sink semantic
khemrajrathore May 4, 2023
3cc4208
removed println
khemrajrathore May 4, 2023
aaa3464
updated semantic
khemrajrathore May 5, 2023
0f4519a
python - apply sink semantic only on leakage till the issue reported …
khemrajrathore May 8, 2023
48a3ddc
Merge branch 'dev' of https://github.com/Privado-Inc/privado-core int…
khemrajrathore May 8, 2023
b3d259c
Merge branch 'dev' of https://github.com/Privado-Inc/privado-core int…
khemrajrathore May 15, 2023
fcd619a
support for named argument
khemrajrathore May 16, 2023
a870ce8
Merge branch 'dev' of https://github.com/Privado-Inc/privado-core int…
khemrajrathore Jun 1, 2023
294cc28
named and length of arguments, used to generate semantic
khemrajrathore Jun 1, 2023
218c0e9
add - take the amalgation of all flows for a signature when generatin…
khemrajrathore Jun 1, 2023
63c941b
added test case
khemrajrathore Jun 1, 2023
d3ff4c3
test failing
khemrajrathore Jun 1, 2023
30e2802
added afterall
khemrajrathore Jun 1, 2023
0854ab9
Merge branch 'dev' of https://github.com/Privado-Inc/privado-core int…
khemrajrathore Jun 15, 2023
d508fed
fix - updated semantic for named argument
khemrajrathore Jun 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
python sink semantic
khemrajrathore committed May 4, 2023
commit 432b23d794cdd3fc3a7978bb6582fad7fad082e8
Original file line number Diff line number Diff line change
@@ -23,28 +23,18 @@

package ai.privado.languageEngine.java.semantic

import ai.privado.cache.{AppCache, RuleCache}
import ai.privado.cache.RuleCache
import ai.privado.entrypoint.PrivadoInput
import ai.privado.model.{CatLevelOne, Constants, InternalTag, Language, Semantic}
import io.joern.dataflowengineoss.DefaultSemantics
import ai.privado.model.{CatLevelOne, Constants, InternalTag}
import ai.privado.semantic.SemanticGenerator
import io.joern.dataflowengineoss.semanticsloader.{Parser, Semantics}
import io.shiftleft.codepropertygraph.generated.Cpg
import io.shiftleft.codepropertygraph.generated.nodes.{AstNode, Call, Method}
import io.shiftleft.semanticcpg.language._
import org.slf4j.LoggerFactory
import scala.collection.mutable

object JavaSemanticGenerator {
object JavaSemanticGenerator extends SemanticGenerator {

implicit val resolver: ICallResolver = NoResolve
private val logger = LoggerFactory.getLogger(getClass)

/** Utility to get the default semantics for dataflow queries
* @return
*/
def getDefaultSemantics: Semantics = {
DefaultSemantics()
}
private val logger = LoggerFactory.getLogger(getClass)

/** Utility to get the semantics (default + custom) using cpg for dataflow queries
*
@@ -53,109 +43,62 @@ object JavaSemanticGenerator {
* @return
*/
def getSemantics(cpg: Cpg, privadoScanConfig: PrivadoInput, ruleCache: RuleCache): Semantics = {
val lang = AppCache.repoLanguage
if (lang != Language.JAVA) {
getDefaultSemantics
} else {
val customSinkSemantics = cpg.call
val customSinkSemantics = getMaximumFlowSemantic(
cpg.call
.where(_.tag.nameExact(Constants.catLevelOne).valueExact(CatLevelOne.SINKS.name))
.callee
.map(generateSemanticForTaint(_, -1))
.dedup
.l
.sorted

val nonTaintingMethods = cpg.method.where(_.callIn).isExternal(true).fullName(".*:(void|boolean|long|int)\\(.*").l
)

var customNonTaintDefaultSemantics = List[String]()
var specialNonTaintDefaultSemantics = List[String]()
var customStringSemantics = List[String]()
var customNonPersonalMemberSemantics = List[String]()
var customNonTaintDefaultSemantics = Seq[String]()
var specialNonTaintDefaultSemantics = Seq[String]()
var customStringSemantics = Seq[String]()
var customNonPersonalMemberSemantics = Seq[String]()

if (!privadoScanConfig.disableRunTimeSemantics) {
customNonTaintDefaultSemantics = nonTaintingMethods
if (!privadoScanConfig.disableRunTimeSemantics) {
val nonTaintingMethods = cpg.method.where(_.callIn).isExternal(true).fullName(".*:(void|boolean|long|int)\\(.*").l
customNonTaintDefaultSemantics = getMaximumFlowSemantic(
nonTaintingMethods
.fullNameNot(".*\\.(add|put|<init>|set|get|append|store|insert|update|merge).*")
.map(generateSemanticForTaint(_))
.dedup
.l
.sorted
)

specialNonTaintDefaultSemantics = nonTaintingMethods
specialNonTaintDefaultSemantics = getMaximumFlowSemantic(
nonTaintingMethods
.fullName(".*\\.(add|put|set|get|append|store|insert|update|merge).*")
.map(generateSemanticForTaint(_, 0))
.dedup
.l
.sorted
)

customStringSemantics = cpg.method
customStringSemantics = getMaximumFlowSemantic(
cpg.method
.filter(_.isExternal)
.where(_.callIn)
.fullName(".*:java.lang.String\\(.*")
.fullNameNot(".*\\.set[A-Za-z_]*:.*")
.map(generateSemanticForTaint(_, -1))
.dedup
.l
.sorted

customNonPersonalMemberSemantics = generateNonPersonalMemberSemantics(cpg)
}
val semanticFromConfig = ruleCache.getRule.semantics.flatMap(generateSemantic).sorted

logger.debug("\nCustom Non taint default semantics")
customNonTaintDefaultSemantics.foreach(logger.debug)
logger.debug("\nCustom specialNonTaintDefaultSemantics semantics")
specialNonTaintDefaultSemantics.foreach(logger.debug)
logger.debug("\nCustom customStringSemantics semantics")
customStringSemantics.foreach(logger.debug)
logger.debug("\nCustom customNonPersonalMemberSemantics semantics")
customNonPersonalMemberSemantics.foreach(logger.debug)
logger.debug("\nCustom customSinkSemantics semantics")
customSinkSemantics.foreach(logger.debug)
logger.debug("\nCustom semanticFromConfig semantics")
semanticFromConfig.foreach(logger.debug)

val list =
customNonTaintDefaultSemantics ++ specialNonTaintDefaultSemantics ++ customStringSemantics ++ customNonPersonalMemberSemantics ++ customSinkSemantics ++ semanticFromConfig
val parsed = new Parser().parse(list.mkString("\n"))
val finalSemantics = JavaSemanticGenerator.getDefaultSemantics.elements ++ parsed
Semantics.fromList(finalSemantics)
}
}
)

/** Generate semantics for tainting passed argument based on the number of parameter in method signature
* @param method
* or call node \- complete signature of method
* @return
* \- semantic string
*/
private def generateSemanticForTaint(methodNode: AstNode, toTaint: Int = -2) = {
val (parameterSize, fullName) = {
methodNode match {
case method: Method => (method.parameter.size, method.fullName)
case call: Call => (call.argument.size, call.methodFullName)
case _ => (0, "NA")
}
customNonPersonalMemberSemantics = generateNonPersonalMemberSemantics(cpg)
}
val parameterSemantic = mutable.HashSet[String]()
for (i <- 0 until parameterSize) {
if (toTaint != -2)
parameterSemantic.add(s"$i->$toTaint")
parameterSemantic.add(s"$i->$i")
}
"\"" + fullName + "\" " + parameterSemantic.toList.sorted.mkString(" ").trim
}

/** Generate Semantic string based on input Semantic
* @param semantic
* \- semantic object containing semantic information
* @return
*/
private def generateSemantic(semantic: Semantic) = {
if (semantic.signature.nonEmpty) {
val generatedSemantic = "\"" + semantic.signature.trim + "\" " + semantic.flow
Some(generatedSemantic.trim)
} else
None
val semanticFromConfig = ruleCache.getRule.semantics.flatMap(generateSemantic).sorted

logger.debug("\nCustom Non taint default semantics")
customNonTaintDefaultSemantics.foreach(logger.debug)
logger.debug("\nCustom specialNonTaintDefaultSemantics semantics")
specialNonTaintDefaultSemantics.foreach(logger.debug)
logger.debug("\nCustom customStringSemantics semantics")
customStringSemantics.foreach(logger.debug)
logger.debug("\nCustom customNonPersonalMemberSemantics semantics")
customNonPersonalMemberSemantics.foreach(logger.debug)
logger.debug("\nCustom customSinkSemantics semantics")
customSinkSemantics.foreach(logger.debug)
logger.debug("\nCustom semanticFromConfig semantics")
semanticFromConfig.foreach(logger.debug)

val list =
customNonTaintDefaultSemantics ++ specialNonTaintDefaultSemantics ++ customStringSemantics ++ customNonPersonalMemberSemantics ++ customSinkSemantics ++ semanticFromConfig
val parsed = new Parser().parse(list.mkString("\n"))
val finalSemantics = JavaSemanticGenerator.getDefaultSemantics.elements ++ parsed
Semantics.fromList(finalSemantics)
}

/** Generates Semantics for non Personal member
@@ -165,30 +108,29 @@ object JavaSemanticGenerator {
*/
def generateNonPersonalMemberSemantics(cpg: Cpg): List[String] = {

val nonPersonalGetterSemantics = cpg.tag
.where(_.nameExact(InternalTag.INSENSITIVE_METHOD_RETURN.toString))
.call
.whereNot(_.tag.nameExact(InternalTag.SENSITIVE_METHOD_RETURN.toString))
.map(generateSemanticForTaint(_))
.dedup
.l
val nonPersonalGetterSemantics = getMaximumFlowSemantic(
cpg.tag
.where(_.nameExact(InternalTag.INSENSITIVE_METHOD_RETURN.toString))
.call
.whereNot(_.tag.nameExact(InternalTag.SENSITIVE_METHOD_RETURN.toString))
.map(generateSemanticForTaint(_))
).l

val nonPersonalSetterMethodFullNames =
val nonPersonalSetterMethodFullNames = getMaximumFlowSemantic(
cpg.tag
.where(_.nameExact(InternalTag.INSENSITIVE_SETTER.toString))
.call
.whereNot(_.nameExact(InternalTag.SENSITIVE_SETTER.toString))
.map(generateSemanticForTaint(_))
.dedup
.l
).l

val personalSetterMethodFullNames =
cpg.tag
.where(_.nameExact(InternalTag.SENSITIVE_SETTER.toString))
.call
.map(methodName => generateSemanticForTaint(methodName, 0))
.dedup
.l
getMaximumFlowSemantic(
cpg.tag
.where(_.nameExact(InternalTag.SENSITIVE_SETTER.toString))
.call
.map(methodName => generateSemanticForTaint(methodName, 0))
).l
(nonPersonalGetterSemantics ::: nonPersonalSetterMethodFullNames ::: personalSetterMethodFullNames).sorted
}
}
Original file line number Diff line number Diff line change
@@ -131,14 +131,6 @@ object PythonProcessor {
logger.debug(
s"Total Sinks identified : ${cpg.tag.where(_.nameExact(Constants.catLevelOne).valueExact(CatLevelOne.SINKS.name)).call.tag.nameExact(Constants.id).value.toSet}"
)
val codelist = cpg.call
.whereNot(_.methodFullName(Operators.ALL.asScala.toSeq: _*))
.map(item => (item.methodFullName, item.location.filename))
.dedup
.l
logger.debug(s"size of code : ${codelist.size}")
codelist.foreach(item => logger.debug(item._1, item._2))
logger.debug("Above we printed methodFullName")
Right(())
}

Original file line number Diff line number Diff line change
@@ -1,63 +1,35 @@
package ai.privado.languageEngine.python.semantic

import ai.privado.cache.RuleCache
import ai.privado.model.{Constants, Semantic}
import io.joern.dataflowengineoss.DefaultSemantics
import ai.privado.model.{CatLevelOne, Constants}
import ai.privado.semantic.SemanticGenerator
import io.joern.dataflowengineoss.semanticsloader.{Parser, Semantics}
import io.shiftleft.codepropertygraph.generated.Cpg
import io.shiftleft.semanticcpg.language._
import org.slf4j.LoggerFactory

object PythonSemanticGenerator {
object PythonSemanticGenerator extends SemanticGenerator {

implicit val resolver: ICallResolver = NoResolve
private val logger = LoggerFactory.getLogger(getClass)

/** Utility to get the default semantics for dataflow queries
*
* @return
*/
def getDefaultSemantics: Semantics = {
DefaultSemantics()
}
private val logger = LoggerFactory.getLogger(getClass)

def getSemantics(cpg: Cpg, ruleCache: RuleCache) = {
val leakageSinkSemantics = cpg.call
.where(_.tag.nameExact(Constants.id).value("Leakages.*"))
.l
.map(call => generateOneToOneSemanticForTaint(call.methodFullName, call.code))
.dedup
.sorted
val customSinkSemantics = getMaximumFlowSemantic(
cpg.call
.where(_.tag.nameExact(Constants.catLevelOne).valueExact(CatLevelOne.SINKS.name))
.map(generateSemanticForTaint(_, -1))
)

val semanticFromConfig = ruleCache.getRule.semantics.flatMap(generateSemantic).sorted

logger.debug("\nCustom customSinkSemantics semantics")
leakageSinkSemantics.foreach(logger.debug)
customSinkSemantics.foreach(logger.debug)
logger.debug("\nCustom semanticFromConfig semantics")
semanticFromConfig.foreach(logger.debug)

val list = leakageSinkSemantics ++ semanticFromConfig
val list = customSinkSemantics ++ semanticFromConfig
val parsed = new Parser().parse(list.mkString("\n"))
val finalSemantics = PythonSemanticGenerator.getDefaultSemantics.elements ++ parsed
Semantics.fromList(finalSemantics)
}

private def generateOneToOneSemanticForTaint(methodName: String, code: String) = {
var parameterNumber = code.count(_.equals(','))
if (parameterNumber <= 2)
parameterNumber = 5
var parameterSemantics = ""
for (i <- 1 to (parameterNumber))
parameterSemantics += s"$i->$i $i->-1 "
"\"" + methodName + "\" " + parameterSemantics.trim
}

private def generateSemantic(semantic: Semantic) = {
if (semantic.signature.nonEmpty) {
val generatedSemantic = "\"" + semantic.signature.trim + "\" " + semantic.flow
Some(generatedSemantic.trim)
} else
None
}

}
106 changes: 106 additions & 0 deletions src/main/scala/ai/privado/semantic/SemanticGenerator.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* This file is part of Privado OSS.
*
* Privado is an open source static code analysis tool to discover data flows in the code.
* Copyright (C) 2022 Privado, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* For more information, contact support@privado.ai
*
*/

package ai.privado.semantic

import ai.privado.model.Semantic
import io.joern.dataflowengineoss.DefaultSemantics
import io.joern.dataflowengineoss.semanticsloader.Semantics
import io.shiftleft.codepropertygraph.generated.nodes.{AstNode, Call, Method}
import io.shiftleft.semanticcpg.language._
import ai.privado.model.Language.UNKNOWN
import overflowdb.traversal.Traversal

import scala.collection.mutable
import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable

trait SemanticGenerator {

  implicit val resolver: ICallResolver = NoResolve

  /** Utility to get the default semantics for dataflow queries.
    *
    * @return
    *   the semantics produced by `DefaultSemantics()`
    */
  def getDefaultSemantics: Semantics = DefaultSemantics()

  /** Generate a semantic whose flow maps every argument index of the given node to itself and, optionally, to one
    * additional target index.
    *
    * The number of indices is taken from the parameter count of a [[Method]] or the argument count of a [[Call]]; any
    * other node yields the signature "NA" with an empty flow.
    *
    * @param methodNode
    *   method or call node the semantic is generated for
    * @param toTaint
    *   extra target index that every argument should flow to (e.g. -1); the default -2 is a sentinel meaning "no extra
    *   target", in which case only the `i->i` mappings are emitted
    * @return
    *   semantic holding the node's full name and the space separated, de-duplicated, lexicographically sorted flow
    */
  def generateSemanticForTaint(methodNode: AstNode, toTaint: Int = -2): Semantic = {
    val (parameterSize, fullName) = methodNode match {
      case method: Method => (method.parameter.size, method.fullName)
      case call: Call     => (call.argument.size, call.methodFullName)
      case _              => (0, "NA")
    }
    val flowMappings = (0 until parameterSize).flatMap { index =>
      val selfMapping = s"$index->$index"
      if (toTaint == -2) Seq(selfMapping) else Seq(s"$index->$toTaint", selfMapping)
    }
    // distinct: when toTaint equals the index both mappings are the same string and must appear only once
    Semantic(fullName, flowMappings.distinct.sorted.mkString(" "), "", UNKNOWN, Array())
  }

  /** Generate Semantic string based on input Semantic.
    *
    * @param semantic
    *   semantic object containing semantic information
    * @return
    *   `"<signature>" <flow>` (trimmed), or None when the signature is empty
    */
  def generateSemantic(semantic: Semantic): Option[String] =
    Option.when(semantic.signature.nonEmpty) {
      ("\"" + semantic.signature.trim + "\" " + semantic.flow).trim
    }

  /** Takes a sequence of semantics as input and returns, for every distinct signature, the semantic which has the
    * longest flow, rendered as a semantic string.
    *
    * ex - If we have 2 semantics with the same signature, we would want the maximum flow one
    *   1. "logging.py:<module>.getLogger.<returnValue>.info" 0->-1 0->0 1->-1 1->1 2->-1 2->2 3->-1 3->3 4->-1 4->4
    *      5->-1 5->5
    *
    * 2. "logging.py:<module>.getLogger.<returnValue>.info" 0->-1 0->0 1->-1 1->1 2->-1 2->2
    *
    * We want the output to be the 1st one as it has the longer flow.
    *
    * "Longest" is decided by the number of flow mappings, not by comparing the flow strings: a plain string comparison
    * ranks "... 1->1 10->-1 10->10 ..." below "... 1->1 2->-1 2->2" and would therefore drop the longer flow as soon as
    * a signature has more than ten arguments. Ties are broken by the flow string so the result is deterministic.
    *
    * @param semantics
    *   semantics to reduce, possibly containing several entries per signature
    * @return
    *   sorted semantic strings, one per non-empty signature
    */
  def getMaximumFlowSemantic(semantics: Traversal[Semantic]): Seq[String] = {
    semantics.l
      .groupBy(_.signature)
      .values
      // groupBy never produces an empty group, so maxBy is safe here
      .map(_.maxBy(candidate => (flowMappingCount(candidate.flow), candidate.flow)))
      .flatMap(generateSemantic)
      .toSeq
      .sorted
  }

  /** Number of whitespace separated mappings in a flow string; 0 for an empty or blank flow. */
  private def flowMappingCount(flow: String): Int = flow.split("\\s+").count(_.nonEmpty)
}