From f06eae7c4138000ad44a4d94084d3908f6d505e2 Mon Sep 17 00:00:00 2001
From: Matt Brown <github@muglug.com>
Date: Mon, 25 Mar 2024 01:11:56 -0400
Subject: [PATCH] Improve taint analysis a little more

---
 src/analyzer/expr/binop/concat_analyzer.rs    |   2 +-
 src/analyzer/expr/call/arguments_analyzer.rs  |  24 ++
 .../existing_atomic_method_call_analyzer.rs   |  48 +--
 .../call/function_call_return_type_fetcher.rs | 378 +++++++++++++-----
 .../expr/fetch/array_fetch_analyzer.rs        |  38 +-
 src/analyzer/stmt_analyzer.rs                 |  18 +-
 src/code_info_builder/lib.rs                  |  15 +-
 src/str/build.rs                              | 150 +++++++
 8 files changed, 525 insertions(+), 148 deletions(-)

diff --git a/src/analyzer/expr/binop/concat_analyzer.rs b/src/analyzer/expr/binop/concat_analyzer.rs
index 83f8ea51..2fec7f9e 100644
--- a/src/analyzer/expr/binop/concat_analyzer.rs
+++ b/src/analyzer/expr/binop/concat_analyzer.rs
@@ -145,7 +145,7 @@ pub(crate) fn analyze_concat_nodes(
     result_type
 }
 
-fn get_concat_nodes(expr: &aast::Expr<(), ()>) -> Vec<&aast::Expr<(), ()>> {
+pub(crate) fn get_concat_nodes(expr: &aast::Expr<(), ()>) -> Vec<&aast::Expr<(), ()>> {
     match &expr.2 {
         aast::Expr_::Binop(x) => {
             let (binop, e1, e2) = (&x.bop, &x.lhs, &x.rhs);
diff --git a/src/analyzer/expr/call/arguments_analyzer.rs b/src/analyzer/expr/call/arguments_analyzer.rs
index ede54664..2b7f55d9 100644
--- a/src/analyzer/expr/call/arguments_analyzer.rs
+++ b/src/analyzer/expr/call/arguments_analyzer.rs
@@ -183,6 +183,7 @@ pub(crate) fn check_arguments_match(
             || matches!(functionlike_info.effects, FnEffect::Arg(_))
             || functionlike_info.pure_can_throw
             || functionlike_info.user_defined
+            || functionlike_info.method_info.is_some()
         {
             context.inside_general_use = true;
         }
@@ -1113,6 +1114,29 @@ fn handle_possibly_matching_inout_param(
             vec![],
             vec![],
         );
+    } else if matches!(
+        functionlike_id,
+        FunctionLikeIdentifier::Function(StrId::JSON_DECODE_WITH_ERROR)
+    ) && argument_offset == 1
+    {
+        let argument_node = DataFlowNode::get_for_method_argument(
+            functionlike_id.to_string(statements_analyzer.get_interner()),
+            0,
+            Some(statements_analyzer.get_hpos(all_args[1].1.pos())),
+            Some(statements_analyzer.get_hpos(function_call_pos)),
+        );
+
+        analysis_data
+            .data_flow_graph
+            .add_node(argument_node.clone());
+
+        analysis_data.data_flow_graph.add_path(
+            &argument_node,
+            &out_node,
+            PathKind::Aggregate,
+            vec![],
+            vec![],
+        );
     }
 
     analysis_data.data_flow_graph.add_node(out_node);
diff --git a/src/analyzer/expr/call/existing_atomic_method_call_analyzer.rs b/src/analyzer/expr/call/existing_atomic_method_call_analyzer.rs
index 2c5b582d..37cac51a 100644
--- a/src/analyzer/expr/call/existing_atomic_method_call_analyzer.rs
+++ b/src/analyzer/expr/call/existing_atomic_method_call_analyzer.rs
@@ -23,7 +23,9 @@ use oxidized::{
 };
 use rustc_hash::FxHashMap;
 
-use crate::expr::fetch::array_fetch_analyzer::add_array_fetch_dataflow;
+use crate::expr::fetch::array_fetch_analyzer::{
+    add_array_fetch_dataflow, get_array_access_type_given_offset,
+};
 use crate::stmt_analyzer::AnalysisError;
 use crate::{
     expr::{
@@ -468,31 +470,31 @@ fn handle_shapes_static_method(
                     .get_rc_expr_type(call_expr.1[1].1.pos())
                     .cloned();
 
-                let mut expr_type = None;
-
                 if let (Some(dict_type), Some(dim_type)) = (dict_type, dim_type) {
-                    for atomic_type in &dict_type.types {
-                        if let TAtomic::TDict { .. } = atomic_type {
-                            let expr_type_inner = handle_array_access_on_dict(
-                                statements_analyzer,
-                                pos,
-                                analysis_data,
-                                context,
-                                atomic_type,
-                                &dim_type,
-                                false,
-                                &mut false,
-                                true,
-                                &mut false,
-                                &mut false,
-                            );
-
-                            expr_type = Some(expr_type_inner);
-                        }
-                    }
+                    let mut expr_type_inner = get_array_access_type_given_offset(
+                        statements_analyzer,
+                        analysis_data,
+                        (&call_expr.1[0].1, Some(&call_expr.1[1].1), pos),
+                        &dict_type,
+                        &dim_type,
+                        false,
+                        &None,
+                        context,
+                    );
+
+                    add_array_fetch_dataflow(
+                        statements_analyzer,
+                        call_expr.1[0].1.pos(),
+                        analysis_data,
+                        None,
+                        &mut expr_type_inner,
+                        &mut (*dim_type).clone(),
+                    );
+
+                    return Some(expr_type_inner);
                 }
 
-                return Some(expr_type.unwrap_or(get_mixed_any()));
+                return Some(get_mixed_any());
             }
         }
         StrId::TO_DICT | StrId::TO_ARRAY => {
diff --git a/src/analyzer/expr/call/function_call_return_type_fetcher.rs b/src/analyzer/expr/call/function_call_return_type_fetcher.rs
index 19af4adf..e3b462bb 100644
--- a/src/analyzer/expr/call/function_call_return_type_fetcher.rs
+++ b/src/analyzer/expr/call/function_call_return_type_fetcher.rs
@@ -25,7 +25,7 @@ use std::collections::BTreeMap;
 use std::path::Path;
 use std::sync::Arc;
 
-use crate::expr::binop::concat_analyzer::analyze_concat_nodes;
+use crate::expr::binop::concat_analyzer::{analyze_concat_nodes, get_concat_nodes};
 use crate::expr::fetch::array_fetch_analyzer::handle_array_access_on_dict;
 use crate::expr::variable_fetch_analyzer;
 use crate::function_analysis_data::FunctionAnalysisData;
@@ -426,73 +426,41 @@ fn handle_special_functions(
                 None
             }
         }
-        &StrId::LIB_STR_FORMAT => {
+        &StrId::LIB_STR_FORMAT | &StrId::SPRINTF => {
             if let Some(first_arg) = args.first() {
-                if let aast::Expr_::String(simple_string) = &first_arg.1 .2 {
-                    let mut escaped = false;
-                    let mut in_format_string = false;
-
-                    let mut literals = vec![];
-
-                    let mut cur_literal = "".to_string();
-
-                    for c in simple_string.iter().copied() {
-                        if in_format_string {
-                            in_format_string = false;
-                            continue;
-                        }
-
-                        if !escaped {
-                            if c as char == '%' {
-                                in_format_string = true;
-                                literals.push(aast::Expr(
-                                    (),
-                                    first_arg.1.pos().clone(),
-                                    aast::Expr_::String(BString::from(cur_literal)),
-                                ));
-                                cur_literal = "".to_string();
-                                continue;
-                            }
+                match &first_arg.1 .2 {
+                    aast::Expr_::String(simple_string) => {
+                        return Some(handle_str_format(
+                            simple_string,
+                            first_arg,
+                            args,
+                            statements_analyzer,
+                            analysis_data,
+                            pos,
+                        ));
+                    }
+                    aast::Expr_::Binop(boxed) => {
+                        let mut concat_nodes = get_concat_nodes(&boxed.lhs);
+                        concat_nodes.push(&boxed.rhs);
 
-                            if c as char == '\\' {
-                                escaped = true;
-                            }
+                        let mut more_complex_string = BString::new(vec![]);
 
-                            in_format_string = false;
-                        } else {
-                            if c as char == '\\' {
-                                cur_literal += "\\";
-                                escaped = false;
-                                continue;
+                        for concat_node in concat_nodes {
+                            if let aast::Expr_::String(simple_string) = &concat_node.2 {
+                                more_complex_string.append(&mut simple_string.clone());
                             }
-
-                            escaped = false;
                         }
 
-                        cur_literal += (c as char).to_string().as_str();
-                    }
-
-                    literals.push(aast::Expr(
-                        (),
-                        first_arg.1.pos().clone(),
-                        aast::Expr_::String(BString::from(cur_literal)),
-                    ));
-
-                    let mut concat_args = vec![];
-
-                    for (i, literal) in literals.iter().enumerate() {
-                        concat_args.push(literal);
-                        if let Some(arg) = args.get(i + 1) {
-                            concat_args.push(&arg.1);
-                        } else {
-                            break;
-                        }
+                        return Some(handle_str_format(
+                            &more_complex_string,
+                            first_arg,
+                            args,
+                            statements_analyzer,
+                            analysis_data,
+                            pos,
+                        ));
                     }
-
-                    let result_type =
-                        analyze_concat_nodes(concat_args, statements_analyzer, analysis_data, pos);
-
-                    return Some(result_type);
+                    _ => (),
                 }
             }
 
@@ -631,6 +599,76 @@ fn handle_special_functions(
     }
 }
 
+/// Infers the type of a `sprintf`-style call by treating it as a concatenation.
+///
+/// `simple_string` is scanned byte by byte and split at every unescaped `%`; the byte right
+/// after the `%` is assumed to be a one-character conversion specifier and is dropped. The
+/// literal pieces are then interleaved with `args[1..]` (`args[0]` being the format string)
+/// and handed to `analyze_concat_nodes`, whose result is returned. `first_arg` only supplies
+/// the position attached to the synthetic literal nodes.
+///
+/// NOTE(review): `%%` and multi-character specifiers such as `%05d` are not special-cased;
+/// confirm whether callers rely on the literal pieces being exact.
+fn handle_str_format(
+    simple_string: &BString,
+    first_arg: &(ast_defs::ParamKind, aast::Expr<(), ()>),
+    args: &[(ast_defs::ParamKind, aast::Expr<(), ()>)],
+    statements_analyzer: &StatementsAnalyzer<'_>,
+    analysis_data: &mut FunctionAnalysisData,
+    pos: &Pos,
+) -> TUnion {
+    // Wraps a finished literal in a synthetic string expression at the format string's pos.
+    let make_literal = |text: String| {
+        aast::Expr(
+            (),
+            first_arg.1.pos().clone(),
+            aast::Expr_::String(BString::from(text)),
+        )
+    };
+
+    let mut escaped = false;
+    let mut skip_specifier = false;
+    let mut literals = vec![];
+    let mut cur_literal = String::new();
+
+    for c in simple_string.iter().copied() {
+        if skip_specifier {
+            // This byte is the conversion specifier following a `%`.
+            skip_specifier = false;
+            continue;
+        }
+
+        if escaped {
+            // The byte after a backslash is always kept verbatim.
+            escaped = false;
+        } else if c == b'%' {
+            skip_specifier = true;
+            literals.push(make_literal(std::mem::take(&mut cur_literal)));
+            continue;
+        } else if c == b'\\' {
+            // The backslash itself is kept in the literal as well.
+            escaped = true;
+        }
+
+        cur_literal.push(c as char);
+    }
+
+    literals.push(make_literal(cur_literal));
+
+    // Interleave literal, argument, literal, ... and stop once the arguments run out.
+    let mut concat_args = vec![];
+
+    for (i, literal) in literals.iter().enumerate() {
+        concat_args.push(literal);
+        match args.get(i + 1) {
+            Some(arg) => concat_args.push(&arg.1),
+            None => break,
+        }
+    }
+
+    analyze_concat_nodes(concat_args, statements_analyzer, analysis_data, pos)
+}
+
 fn get_type_structure_type(
     statements_analyzer: &StatementsAnalyzer,
     first_expr_type: &TUnion,
@@ -755,7 +793,17 @@ fn add_dataflow(
 
     data_flow_graph.add_node(function_call_node.clone());
 
-    let (param_offsets, variadic_path) = get_special_argument_nodes(functionlike_id, expr);
+    let (param_offsets, variadic_path) =
+        if !functionlike_storage.user_defined && (!expr.2.is_empty() || expr.3.is_some()) {
+            get_special_argument_nodes(
+                functionlike_id,
+                expr,
+                functionlike_storage,
+                statements_analyzer.get_interner(),
+            )
+        } else {
+            (vec![], None)
+        };
 
     let added_removed_taints = if let GraphKind::WholeProgram(_) = &data_flow_graph.kind {
         get_special_added_removed_taints(functionlike_id, statements_analyzer.get_interner())
@@ -893,6 +941,8 @@ fn get_special_argument_nodes(
         &Vec<(ast_defs::ParamKind, aast::Expr<(), ()>)>,
         &Option<aast::Expr<(), ()>>,
     ),
+    _functionlike_info: &FunctionLikeInfo,
+    _interner: &Interner,
 ) -> (Vec<(usize, PathKind)>, Option<PathKind>) {
     match functionlike_id {
         FunctionLikeIdentifier::Function(function_name) => match *function_name {
@@ -904,9 +954,6 @@ fn get_special_argument_nodes(
             | StrId::TRIM
             | StrId::LTRIM
             | StrId::RTRIM
-            | StrId::LIB_STR_TRIM
-            | StrId::LIB_STR_TRIM_LEFT
-            | StrId::LIB_STR_TRIM_RIGHT
             | StrId::LIB_STR_LOWERCASE
             | StrId::LIB_STR_UPPERCASE
             | StrId::LIB_STR_CAPITALIZE
@@ -945,7 +992,6 @@ fn get_special_argument_nodes(
             | StrId::CHOP
             | StrId::CONVERT_UUDECODE
             | StrId::CONVERT_UUENCODE
-            | StrId::JSON_DECODE
             | StrId::BASE64_ENCODE
             | StrId::BASE64_DECODE
             | StrId::URLENCODE
@@ -982,11 +1028,38 @@ fn get_special_argument_nodes(
             | StrId::IP2LONG
             | StrId::BIN2HEX
             | StrId::HEX2BIN
-            | StrId::ESCAPESHELLARG => (vec![(0, PathKind::Default)], None),
-            StrId::LIB_REGEX_FIRST_MATCH => (vec![(0, PathKind::Default)], Some(PathKind::Default)),
+            | StrId::ESCAPESHELLARG
+            | StrId::FIXME_UNSAFE_CAST
+            | StrId::LIB_DICT_COUNT_VALUES
+            | StrId::LIB_DICT_UNIQUE
+            | StrId::LIB_STR_REVERSE
+            | StrId::LIB_VEC_CAST_CLEAR_LEGACY_ARRAY_MARK
+            | StrId::CLASS_METH_GET_CLASS
+            | StrId::CLASS_METH_GET_METHOD
+            | StrId::CHR
+            | StrId::DECBIN
+            | StrId::DECHEX
+            | StrId::FB_SERIALIZE
+            | StrId::HEXDEC
+            | StrId::LZ4_COMPRESS
+            | StrId::LZ4_UNCOMPRESS
+            | StrId::RAWURLDECODE
+            | StrId::UTF8_DECODE
+            | StrId::UTF8_ENCODE
+            | StrId::STREAM_GET_META_DATA
+            | StrId::DIRNAME => (vec![(0, PathKind::Default)], None),
+            StrId::LIB_REGEX_FIRST_MATCH
+            | StrId::LIB_DICT_MERGE
+            | StrId::ARRAY_MERGE
+            | StrId::LIB_VEC_CONCAT
+            | StrId::LIB_KEYSET_UNION
+            | StrId::PACK
+            | StrId::UNPACK
+            | StrId::JSON_DECODE => (vec![(0, PathKind::Default)], Some(PathKind::Default)),
             StrId::LIB_DICT_SELECT_KEYS
             | StrId::LIB_VEC_TAKE
             | StrId::LIB_DICT_TAKE
+            | StrId::LIB_KEYSET_TAKE
             | StrId::LIB_STR_SLICE
             | StrId::LIB_STR_FORMAT_NUMBER
             | StrId::LIB_DICT_DIFF_BY_KEY
@@ -995,6 +1068,8 @@ fn get_special_argument_nodes(
             | StrId::LIB_VEC_DIFF
             | StrId::LIB_KEYSET_DIFF
             | StrId::LIB_KEYSET_INTERSECT
+            | StrId::LIB_DICT_DROP
+            | StrId::LIB_KEYSET_DROP
             | StrId::LIB_VEC_INTERSECT
             | StrId::LIB_VEC_SLICE
             | StrId::LIB_VEC_RANGE
@@ -1004,9 +1079,26 @@ fn get_special_argument_nodes(
             | StrId::LIB_STR_STRIP_SUFFIX
             | StrId::LIB_STR_REPEAT
             | StrId::SUBSTR
-            | StrId::LIB_DICT_ASSOCIATE => {
-                (vec![(0, PathKind::Default)], Some(PathKind::Aggregate))
-            }
+            | StrId::LIB_DICT_ASSOCIATE
+            | StrId::GZCOMPRESS
+            | StrId::GZDECODE
+            | StrId::GZDEFLATE
+            | StrId::GZUNCOMPRESS
+            | StrId::JSON_DECODE_WITH_ERROR
+            | StrId::LIB__PRIVATE_REGEX_MATCH
+            | StrId::LIB_STR_TRIM
+            | StrId::LIB_STR_TRIM_LEFT
+            | StrId::LIB_STR_TRIM_RIGHT
+            | StrId::BASENAME => (vec![(0, PathKind::Default)], Some(PathKind::Aggregate)),
+            StrId::LIB_STR_SLICE_L => (
+                vec![
+                    (0, PathKind::Aggregate),
+                    (1, PathKind::Default),
+                    (1, PathKind::Aggregate),
+                    (2, PathKind::Aggregate),
+                ],
+                None,
+            ),
             StrId::LIB_C_IS_EMPTY
             | StrId::LIB_C_COUNT
             | StrId::COUNT
@@ -1017,7 +1109,6 @@ fn get_special_argument_nodes(
             | StrId::LIB_STR_LENGTH
             | StrId::LIB_VEC_KEYS
             | StrId::LIB_STR_TO_INT
-            | StrId::LIB_MATH_ROUND
             | StrId::LIB_MATH_SUM
             | StrId::LIB_MATH_SUM_FLOAT
             | StrId::LIB_MATH_MIN
@@ -1039,9 +1130,56 @@ fn get_special_argument_nodes(
             | StrId::CTYPE_LOWER
             | StrId::SHA1
             | StrId::MD5
-            | StrId::DIRNAME
+            | StrId::NON_CRYPTO_MD5_LOWER
+            | StrId::NON_CRYPTO_MD5_UPPER
             | StrId::CRC32
-            | StrId::FILTER_VAR => (vec![(0, PathKind::Aggregate)], None),
+            | StrId::FILTER_VAR
+            | StrId::LIB_LOCALE_CREATE
+            | StrId::IS_A
+            | StrId::IS_BOOL
+            | StrId::IS_CALLABLE
+            | StrId::IS_CALLABLE_WITH_NAME
+            | StrId::IS_FINITE
+            | StrId::IS_FLOAT
+            | StrId::IS_INFINITE
+            | StrId::IS_INT
+            | StrId::IS_NAN
+            | StrId::IS_NULL
+            | StrId::IS_NUMERIC
+            | StrId::IS_OBJECT
+            | StrId::IS_RESOURCE
+            | StrId::IS_SCALAR
+            | StrId::IS_STRING
+            | StrId::CTYPE_ALNUM
+            | StrId::CTYPE_ALPHA
+            | StrId::CTYPE_DIGIT
+            | StrId::CTYPE_PUNCT
+            | StrId::CTYPE_SPACE
+            | StrId::CTYPE_UPPER
+            | StrId::CTYPE_XDIGIT
+            | StrId::IS_DICT
+            | StrId::IS_VEC
+            | StrId::IS_ANY_ARRAY
+            | StrId::IS_DICT_OR_DARRAY
+            | StrId::IS_VEC_OR_VARRAY
+            | StrId::ASIN
+            | StrId::ATAN2
+            | StrId::CEIL
+            | StrId::ABS
+            | StrId::DEG2RAD
+            | StrId::FLOOR
+            | StrId::CLASS_EXISTS
+            | StrId::LONG2IP
+            | StrId::RAD2DEG
+            | StrId::ROUND
+            | StrId::GETTYPE
+            | StrId::IS_FUN
+            | StrId::IS_PHP_ARRAY
+            | StrId::FUNCTION_EXISTS
+            | StrId::GET_PARENT_CLASS
+            | StrId::GET_RESOURCE_TYPE
+            | StrId::FLOATVAL
+            | StrId::TYPE_STRUCTURE_FN => (vec![(0, PathKind::Aggregate)], None),
             StrId::LIB_MATH_ALMOST_EQUALS
             | StrId::LIB_MATH_BASE_CONVERT
             | StrId::LIB_MATH_EXP
@@ -1056,6 +1194,10 @@ fn get_special_argument_nodes(
             | StrId::LIB_STR_ENDS_WITH
             | StrId::LIB_STR_ENDS_WITH_CI
             | StrId::LIB_STR_SEARCH
+            | StrId::LIB_STR_SEARCH_L
+            | StrId::LIB_STR_SEARCH_LAST
+            | StrId::LIB_STR_SEARCH_LAST_L
+            | StrId::LIB_STR_SEARCH_CI
             | StrId::LIB_STR_CONTAINS
             | StrId::LIB_STR_CONTAINS_CI
             | StrId::LIB_STR_COMPARE
@@ -1066,7 +1208,29 @@ fn get_special_argument_nodes(
             | StrId::SUBSTR_COUNT
             | StrId::STRCMP
             | StrId::STRNATCASECMP
-            | StrId::LIB_KEYSET_EQUAL => (vec![], Some(PathKind::Aggregate)),
+            | StrId::LIB_KEYSET_EQUAL
+            | StrId::LIB_DICT_EQUAL
+            | StrId::LIB_LEGACY_FIXME_EQ
+            | StrId::LIB_LEGACY_FIXME_LT
+            | StrId::LIB_LEGACY_FIXME_NEQ
+            | StrId::LIB_STR_LENGTH_L
+            | StrId::IS_SUBCLASS_OF
+            | StrId::STRIPOS
+            | StrId::STRLEN
+            | StrId::STRNATCMP
+            | StrId::STRNCMP
+            | StrId::STRRPOS
+            | StrId::STRSPN
+            | StrId::LEVENSHTEIN
+            | StrId::INTDIV
+            | StrId::STRCASECMP
+            | StrId::STRCSPN
+            | StrId::SUBSTR_COMPARE
+            | StrId::VERSION_COMPARE
+            | StrId::FMOD
+            | StrId::POW
+            | StrId::LIB_MATH_ROUND
+            | StrId::MB_DETECT_ENCODING => (vec![], Some(PathKind::Aggregate)),
             StrId::LIB_C_CONTAINS
             | StrId::LIB_C_CONTAINS_KEY
             | StrId::IN_ARRAY
@@ -1083,6 +1247,15 @@ fn get_special_argument_nodes(
                 ],
                 None,
             ),
+            StrId::PREG_MATCH_WITH_MATCHES_AND_ERROR => (
+                vec![
+                    (0, PathKind::Aggregate),
+                    (1, PathKind::Aggregate),
+                    (4, PathKind::Aggregate),
+                    (5, PathKind::Aggregate),
+                ],
+                None,
+            ),
             StrId::JSON_ENCODE | StrId::SERIALIZE => (vec![(0, PathKind::Serialize)], None),
             StrId::VAR_DUMP | StrId::PRINTF => {
                 (vec![(0, PathKind::Serialize)], Some(PathKind::Serialize))
@@ -1110,21 +1283,21 @@ fn get_special_argument_nodes(
                 None,
             ),
             StrId::PREG_GREP => (vec![(0, PathKind::Aggregate), (1, PathKind::Default)], None),
-            StrId::LIB_STR_REPLACE_EVERY => (
+            StrId::LIB_STR_REPLACE_EVERY | StrId::VSPRINTF | StrId::IMPLODE | StrId::JOIN => (
                 vec![
                     (0, PathKind::Default),
                     (1, PathKind::UnknownArrayFetch(ArrayDataKind::ArrayValue)),
                 ],
                 None,
             ),
-
             StrId::STR_PAD
             | StrId::LIB_STR_PAD_LEFT
             | StrId::LIB_STR_PAD_RIGHT
             | StrId::CHUNK_SPLIT
             | StrId::LIB_REGEX_REPLACE
             | StrId::LIB_STR_REPLACE
-            | StrId::LIB_STR_REPLACE_CI => (
+            | StrId::LIB_STR_REPLACE_CI
+            | StrId::STRTR => (
                 vec![
                     (0, PathKind::Default),
                     (1, PathKind::Aggregate),
@@ -1132,10 +1305,12 @@ fn get_special_argument_nodes(
                 ],
                 None,
             ),
-            StrId::IMPLODE | StrId::JOIN => (
+            StrId::LIB_STR_SPLICE => (
                 vec![
                     (0, PathKind::Default),
-                    (1, PathKind::UnknownArrayFetch(ArrayDataKind::ArrayValue)),
+                    (1, PathKind::Default),
+                    (2, PathKind::Aggregate),
+                    (3, PathKind::Aggregate),
                 ],
                 None,
             ),
@@ -1149,15 +1324,29 @@ fn get_special_argument_nodes(
                 ],
                 None,
             ),
+            StrId::LIB_VEC_FILL | StrId::EXPLODE | StrId::PREG_SPLIT => (
+                vec![
+                    (0, PathKind::Aggregate),
+                    (
+                        1,
+                        PathKind::UnknownArrayAssignment(ArrayDataKind::ArrayValue),
+                    ),
+                ],
+                None,
+            ),
             StrId::HTTP_BUILD_QUERY => (
                 vec![(0, PathKind::UnknownArrayFetch(ArrayDataKind::ArrayValue))],
                 None,
             ),
-            StrId::EXPLODE | StrId::PREG_SPLIT => (
-                vec![(
-                    1,
-                    PathKind::UnknownArrayAssignment(ArrayDataKind::ArrayValue),
-                )],
+            StrId::LIB_REGEX_SPLIT => (
+                vec![
+                    (
+                        0,
+                        PathKind::UnknownArrayAssignment(ArrayDataKind::ArrayValue),
+                    ),
+                    (1, PathKind::Aggregate),
+                    (2, PathKind::Aggregate),
+                ],
                 None,
             ),
             StrId::LIB_VEC_ZIP => (
@@ -1260,6 +1449,7 @@ fn get_special_argument_nodes(
             ),
             StrId::LIB_C_FIRST
             | StrId::LIB_C_FIRSTX
+            | StrId::LIB_C_NFIRST
             | StrId::LIB_C_LAST
             | StrId::LIB_C_LASTX
             | StrId::LIB_C_ONLYX
@@ -1306,16 +1496,20 @@ fn get_special_argument_nodes(
                 vec![(0, PathKind::UnknownArrayFetch(ArrayDataKind::ArrayKey))],
                 None,
             ),
-            StrId::LIB_DICT_MERGE | StrId::LIB_VEC_CONCAT | StrId::LIB_KEYSET_UNION => {
-                (vec![(0, PathKind::Default)], Some(PathKind::Default))
-            }
+            // handled separately
+            StrId::LIB_STR_FORMAT | StrId::SPRINTF => (vec![], None),
             _ => {
-                // if function_name.starts_with("HH\\Lib\\")
-                //     && !function_name.starts_with("HH\\Lib\\Math\\")
+                // if !matches!(functionlike_info.effects, FnEffect::Some(_))
+                //     && !matches!(functionlike_info.effects, FnEffect::Arg(_))
+                //     && !functionlike_info.pure_can_throw
+                //     && !functionlike_info.user_defined
                 // {
-                //     println!("no taints through {}", function_name);
+                //     println!("{}", functionlike_id.to_string(interner));
                 // }
-                (vec![], None)
+
+                // this is a cop-out, but it errs on the side of false positives rather than
+                // false negatives in taint analysis
+                (vec![], Some(PathKind::Default))
             }
         },
         _ => panic!(),
diff --git a/src/analyzer/expr/fetch/array_fetch_analyzer.rs b/src/analyzer/expr/fetch/array_fetch_analyzer.rs
index a2ff3d71..fee9a5ff 100644
--- a/src/analyzer/expr/fetch/array_fetch_analyzer.rs
+++ b/src/analyzer/expr/fetch/array_fetch_analyzer.rs
@@ -100,7 +100,7 @@ pub(crate) fn analyze(
     if let Some(stmt_var_type) = stmt_var_type {
         // maybe todo handle access on null
 
-        let stmt_type = Some(get_array_access_type_given_offset(
+        let mut stmt_type_inner = get_array_access_type_given_offset(
             statements_analyzer,
             analysis_data,
             (expr.0, expr.1, pos),
@@ -109,30 +109,28 @@ pub(crate) fn analyze(
             false,
             &extended_var_id,
             context,
-        ));
+        );
 
-        if let Some(mut stmt_type) = stmt_type.clone() {
-            if let Some(keyed_array_var_id) = &keyed_array_var_id {
-                let can_store_result = context.inside_assignment || !stmt_var_type.is_mixed();
+        if let Some(keyed_array_var_id) = &keyed_array_var_id {
+            let can_store_result = context.inside_assignment || !stmt_var_type.is_mixed();
 
-                if !context.inside_isset && can_store_result && keyed_array_var_id.contains("[$") {
-                    context
-                        .vars_in_scope
-                        .insert(keyed_array_var_id.clone(), Rc::new(stmt_type.clone()));
-                }
+            if !context.inside_isset && can_store_result && keyed_array_var_id.contains("[$") {
+                context
+                    .vars_in_scope
+                    .insert(keyed_array_var_id.clone(), Rc::new(stmt_type_inner.clone()));
             }
+        }
 
-            add_array_fetch_dataflow(
-                statements_analyzer,
-                expr.0.pos(),
-                analysis_data,
-                keyed_array_var_id.clone(),
-                &mut stmt_type,
-                &mut used_key_type,
-            );
+        add_array_fetch_dataflow(
+            statements_analyzer,
+            expr.0.pos(),
+            analysis_data,
+            keyed_array_var_id.clone(),
+            &mut stmt_type_inner,
+            &mut used_key_type,
+        );
 
-            analysis_data.set_expr_type(pos, stmt_type.clone());
-        }
+        analysis_data.set_expr_type(pos, stmt_type_inner.clone());
     }
 
     if let Some(dim_expr) = expr.1 {
diff --git a/src/analyzer/stmt_analyzer.rs b/src/analyzer/stmt_analyzer.rs
index 7a7d15bc..c127abc1 100644
--- a/src/analyzer/stmt_analyzer.rs
+++ b/src/analyzer/stmt_analyzer.rs
@@ -294,17 +294,13 @@ fn detect_unused_statement_expressions(
             if let Some(functionlike_id) = functionlike_id {
                 match functionlike_id {
                     FunctionLikeIdentifier::Function(function_id) => {
-                        if function_id == StrId::INVARIANT
-                            || function_id == StrId::INVARIANT_VIOLATION
-                            || function_id == StrId::TRIGGER_ERROR
-                            || function_id == StrId::FUNCTION_EXISTS
-                            || function_id == StrId::CLASS_EXISTS
-                            || function_id == StrId::SET_FRAME_METADATA
-                            || function_id == StrId::LIB_C_FIRSTX
-                            || function_id == StrId::LIB_C_LASTX
-                            || function_id == StrId::LIB_C_ONLYX
+                        let codebase = statements_analyzer.get_codebase();
+
+                        if let Some(functionlike_info) = codebase
+                            .functionlike_infos
+                            .get(&(function_id, StrId::EMPTY))
                         {
-                            fn_can_throw = true;
+                            fn_can_throw = functionlike_info.pure_can_throw
                         }
                     }
                     FunctionLikeIdentifier::Method(_, method_name_id) => {
@@ -316,7 +312,7 @@ fn detect_unused_statement_expressions(
                             fn_can_throw = true;
                         }
                     }
-                    _ => {}
+                    _ => (),
                 }
             };
 
diff --git a/src/code_info_builder/lib.rs b/src/code_info_builder/lib.rs
index a7b8586f..94701e27 100644
--- a/src/code_info_builder/lib.rs
+++ b/src/code_info_builder/lib.rs
@@ -638,7 +638,20 @@ impl<'a> Scanner<'a> {
 
         functionlike_storage.is_production_code = self.file_source.is_production_code;
 
-        if name == Some(StrId::INVARIANT) {
+        if matches!(
+            name,
+            Some(
+                StrId::INVARIANT
+                    | StrId::INVARIANT_VIOLATION
+                    | StrId::TRIGGER_ERROR
+                    | StrId::FUNCTION_EXISTS
+                    | StrId::CLASS_EXISTS
+                    | StrId::SET_FRAME_METADATA
+                    | StrId::LIB_C_FIRSTX
+                    | StrId::LIB_C_LASTX
+                    | StrId::LIB_C_ONLYX
+            )
+        ) {
             functionlike_storage.pure_can_throw = true;
         }
 
diff --git a/src/str/build.rs b/src/str/build.rs
index ff9ef38d..ed56d190 100644
--- a/src/str/build.rs
+++ b/src/str/build.rs
@@ -26,6 +26,8 @@ fn main() -> Result<()> {
         "HH\\BuiltinEnumClass",
         "HH\\Container",
         "HH\\EnumClass\\Label",
+        "HH\\FIXME\\UNSAFE_CAST",
+        "HH\\Facts\\enabled",
         "HH\\FormatString",
         "HH\\Iterator",
         "HH\\KeyedContainer",
@@ -48,13 +50,17 @@ fn main() -> Result<()> {
         "HH\\Lib\\C\\last_key",
         "HH\\Lib\\C\\last_keyx",
         "HH\\Lib\\C\\lastx",
+        "HH\\Lib\\C\\nfirst",
         "HH\\Lib\\C\\onlyx",
         "HH\\Lib\\C\\search",
         "HH\\Lib\\Dict\\associate",
         "HH\\Lib\\Dict\\chunk",
         "HH\\Lib\\Dict\\contains",
         "HH\\Lib\\Dict\\contains_key",
+        "HH\\Lib\\Dict\\count_values",
         "HH\\Lib\\Dict\\diff_by_key",
+        "HH\\Lib\\Dict\\drop",
+        "HH\\Lib\\Dict\\equal",
         "HH\\Lib\\Dict\\fill_keys",
         "HH\\Lib\\Dict\\filter",
         "HH\\Lib\\Dict\\filter_async",
@@ -75,8 +81,10 @@ fn main() -> Result<()> {
         "HH\\Lib\\Dict\\reverse",
         "HH\\Lib\\Dict\\select_keys",
         "HH\\Lib\\Dict\\take",
+        "HH\\Lib\\Dict\\unique",
         "HH\\Lib\\Keyset\\chunk",
         "HH\\Lib\\Keyset\\diff",
+        "HH\\Lib\\Keyset\\drop",
         "HH\\Lib\\Keyset\\equal",
         "HH\\Lib\\Keyset\\filter",
         "HH\\Lib\\Keyset\\filter_async",
@@ -89,6 +97,10 @@ fn main() -> Result<()> {
         "HH\\Lib\\Keyset\\map_with_key",
         "HH\\Lib\\Keyset\\take",
         "HH\\Lib\\Keyset\\union",
+        "HH\\Lib\\Legacy_FIXME\\eq",
+        "HH\\Lib\\Legacy_FIXME\\lt",
+        "HH\\Lib\\Legacy_FIXME\\neq",
+        "HH\\Lib\\Locale\\create",
         "HH\\Lib\\Math\\INT32_MAX",
         "HH\\Lib\\Math\\abs",
         "HH\\Lib\\Math\\almost_equals",
@@ -121,6 +133,7 @@ fn main() -> Result<()> {
         "HH\\Lib\\Regex\\first_match",
         "HH\\Lib\\Regex\\matches",
         "HH\\Lib\\Regex\\replace",
+        "HH\\Lib\\Regex\\split",
         "HH\\Lib\\Str\\capitalize",
         "HH\\Lib\\Str\\capitalize_words",
         "HH\\Lib\\Str\\chunk",
@@ -135,6 +148,7 @@ fn main() -> Result<()> {
         "HH\\Lib\\Str\\is_empty",
         "HH\\Lib\\Str\\join",
         "HH\\Lib\\Str\\length",
+        "HH\\Lib\\Str\\length_l",
         "HH\\Lib\\Str\\lowercase",
         "HH\\Lib\\Str\\pad_left",
         "HH\\Lib\\Str\\pad_right",
@@ -142,8 +156,15 @@ fn main() -> Result<()> {
         "HH\\Lib\\Str\\replace",
         "HH\\Lib\\Str\\replace_ci",
         "HH\\Lib\\Str\\replace_every",
+        "HH\\Lib\\Str\\reverse",
         "HH\\Lib\\Str\\search",
+        "HH\\Lib\\Str\\search_ci",
+        "HH\\Lib\\Str\\search_l",
+        "HH\\Lib\\Str\\search_last",
+        "HH\\Lib\\Str\\search_last_l",
         "HH\\Lib\\Str\\slice",
+        "HH\\Lib\\Str\\slice_l",
+        "HH\\Lib\\Str\\splice",
         "HH\\Lib\\Str\\split",
         "HH\\Lib\\Str\\starts_with",
         "HH\\Lib\\Str\\starts_with_ci",
@@ -154,10 +175,12 @@ fn main() -> Result<()> {
         "HH\\Lib\\Str\\trim_left",
         "HH\\Lib\\Str\\trim_right",
         "HH\\Lib\\Str\\uppercase",
+        "HH\\Lib\\Vec\\cast_clear_legacy_array_mark",
         "HH\\Lib\\Vec\\chunk",
         "HH\\Lib\\Vec\\concat",
         "HH\\Lib\\Vec\\diff",
         "HH\\Lib\\Vec\\drop",
+        "HH\\Lib\\Vec\\fill",
         "HH\\Lib\\Vec\\filter",
         "HH\\Lib\\Vec\\filter_async",
         "HH\\Lib\\Vec\\filter_nulls",
@@ -176,19 +199,41 @@ fn main() -> Result<()> {
         "HH\\Lib\\Vec\\take",
         "HH\\Lib\\Vec\\unique",
         "HH\\Lib\\Vec\\zip",
+        "HH\\Lib\\_Private\\regex_match",
+        "HH\\Lib\\_Private\\validate_offset",
         "HH\\MemberOf",
+        "HH\\ReifiedGenerics\\get_classname",
+        "HH\\ReifiedGenerics\\get_type_structure",
         "HH\\Shapes",
         "HH\\Traversable",
         "HH\\TypeStructure",
         "HH\\Vector",
+        "HH\\class_meth_get_class",
+        "HH\\class_meth_get_method",
+        "HH\\darray",
         "HH\\dict",
+        "HH\\ffp_parse_string_native",
+        "HH\\fun_get_function",
         "HH\\global_get",
         "HH\\idx",
         "HH\\invariant",
         "HH\\invariant_violation",
+        "HH\\is_any_array",
+        "HH\\is_dict",
+        "HH\\is_dict_or_darray",
+        "HH\\is_fun",
+        "HH\\is_php_array",
+        "HH\\is_vec",
+        "HH\\is_vec_or_varray",
         "HH\\keyset",
+        "HH\\non_crypto_md5_lower",
+        "HH\\non_crypto_md5_upper",
         "HH\\set_frame_metadata",
+        "HH\\str_number_coercible",
+        "HH\\str_to_numeric",
         "HH\\type_structure",
+        "HH\\type_structure_for_alias",
+        "HH\\varray",
         "HH\\vec",
         "Hakana\\FindPaths\\Sanitize",
         "Hakana\\Immutable",
@@ -213,16 +258,33 @@ fn main() -> Result<()> {
         "__PHP_Incomplete_Class",
         "__Sealed",
         "__construct",
+        "abs",
         "addcslashes",
         "addslashes",
+        "array_combine",
+        "array_key_exists",
+        "array_keys",
+        "array_merge",
+        "array_push",
+        "array_reverse",
+        "array_shift",
+        "array_slice",
+        "array_unique",
+        "array_unshift",
+        "arsort",
+        "asin",
+        "asort",
         "assert",
         "assertAll",
         "at",
+        "atan2",
         "base64_decode",
         "base64_encode",
         "basename",
         "bin2hex",
+        "ceil",
         "chop",
+        "chr",
         "chunk_split",
         "class_exists",
         "coerce",
@@ -230,28 +292,54 @@ fn main() -> Result<()> {
         "convert_uuencode",
         "count",
         "crc32",
+        "ctype_alnum",
+        "ctype_alpha",
+        "ctype_digit",
         "ctype_lower",
+        "ctype_punct",
+        "ctype_space",
+        "ctype_upper",
+        "ctype_xdigit",
+        "curl_error",
         "date",
         "date_format",
         "debug_backtrace",
+        "decbin",
+        "dechex",
+        "deg2rad",
         "dirname",
         "echo",
         "escapeshellarg",
         "explode",
         "extension",
+        "fb_serialize",
         "file_get_contents",
         "filename",
         "filter_var",
+        "floatval",
+        "floor",
+        "fmod",
         "fromItems",
         "function_exists",
         "get_class",
         "get_object_vars",
+        "get_parent_class",
+        "get_resource_type",
+        "gethostname",
+        "getrandmax",
+        "gettype",
+        "gzcompress",
+        "gzdecode",
+        "gzdeflate",
         "gzinflate",
+        "gzuncompress",
         "hash",
         "hash_equals",
         "hash_hmac",
         "hex2bin",
+        "hexdec",
         "highlight_string",
+        "hphp_to_string",
         "htmlentities",
         "htmlentitydecode",
         "htmlspecialchars",
@@ -261,51 +349,98 @@ fn main() -> Result<()> {
         "implode",
         "in_array",
         "include",
+        "inet_ntop",
+        "inet_pton",
+        "intdiv",
+        "interface_exists",
         "intval",
         "ip2long",
+        "is_a",
+        "is_bool",
+        "is_callable",
+        "is_callable_with_name",
+        "is_finite",
+        "is_float",
+        "is_infinite",
+        "is_int",
+        "is_nan",
+        "is_null",
+        "is_numeric",
+        "is_object",
+        "is_resource",
+        "is_scalar",
+        "is_string",
+        "is_subclass_of",
         "isset",
         "join",
         "json_decode",
+        "json_decode_with_error",
         "json_encode",
         "keyExists",
+        "krsort",
+        "ksort",
         "lcfirst",
+        "levenshtein",
         "log",
+        "long2ip",
         "ltrim",
+        "lz4_compress",
+        "lz4_uncompress",
+        "max",
+        "mb_detect_encoding",
+        "mb_list_encodings",
         "mb_strlen",
         "mb_strtolower",
         "mb_strtoupper",
         "md5",
+        "method_exists",
         "microtime",
+        "min",
         "mktime",
+        "mt_getrandmax",
+        "mysql_escape_string",
         "nl2br",
         "number_format",
         "ord",
+        "pack",
         "parent",
         "password_hash",
         "pathinfo",
+        "pow",
         "preg_filter",
         "preg_grep",
         "preg_match",
+        "preg_match_all",
         "preg_match_all_with_matches",
+        "preg_match_with_error",
         "preg_match_with_matches",
+        "preg_match_with_matches_and_error",
         "preg_quote",
         "preg_replace",
         "preg_replace_with_count",
         "preg_split",
         "print_r",
+        "print_r_pure",
         "printf",
         "quote_meta",
         "quoted_printable_decode",
         "quoted_printable_encode",
+        "rad2deg",
         "rand",
         "range",
+        "rawurldecode",
         "rawurlencode",
         "realpath",
         "removeKey",
+        "round",
+        "rsort",
         "rtrim",
         "self",
         "serialize",
         "sha1",
+        "socket_strerror",
+        "sort",
+        "sprintf",
         "sscanf",
         "static",
         "stdClass",
@@ -316,25 +451,37 @@ fn main() -> Result<()> {
         "str_rot13",
         "str_shuffle",
         "str_split",
+        "str_word_count",
+        "strcasecmp",
         "strchr",
         "strcmp",
+        "strcspn",
+        "stream_get_meta_data",
         "strgetcsv",
         "strip_tags",
         "stripcslashes",
+        "stripos",
         "stripslashes",
         "stristr",
+        "strlen",
         "strnatcasecmp",
+        "strnatcmp",
+        "strncmp",
         "strpad",
         "strpbrk",
         "strpos",
         "strrchr",
         "strrev",
+        "strrpos",
+        "strspn",
         "strstr",
         "strtolower",
         "strtotime",
         "strtoupper",
+        "strtr",
         "strval",
         "substr",
+        "substr_compare",
         "substr_count",
         "substr_replace",
         "this",
@@ -344,12 +491,15 @@ fn main() -> Result<()> {
         "trim",
         "ucfirst",
         "ucwords",
+        "unpack",
         "unset",
         "urldecode",
         "urlencode",
+        "utf8_decode",
         "utf8_encode",
         "var_dump",
         "var_export",
+        "version_compare",
         "vsprintf",
         "wordwrap",
     ];