From e32378e660a4b4c7b99eff928f3e55c4467261f9 Mon Sep 17 00:00:00 2001 From: PedroDnT Date: Wed, 24 Jul 2024 18:10:22 -0300 Subject: [PATCH] chore: Refactor analyze_model_performance function for better performance --- .env_example | 3 + __pycache__/call.cpython-312.pyc | Bin 10553 -> 17471 bytes __pycache__/utils.cpython-312.pyc | Bin 10355 -> 10355 bytes call.py | 169 +++++++++++++++++++++++++++--- utils.py | 5 +- 5 files changed, 161 insertions(+), 16 deletions(-) create mode 100644 .env_example diff --git a/.env_example b/.env_example new file mode 100644 index 0000000..ab6b016 --- /dev/null +++ b/.env_example @@ -0,0 +1,3 @@ +db_connection_string = +OPENAI_API_KEY= +OPENROUTER_API_KEY= \ No newline at end of file diff --git a/__pycache__/call.cpython-312.pyc b/__pycache__/call.cpython-312.pyc index 6efbdbf46a94171efc78e46883b8cb0e4aeddb5f..bba3c1d857434874412acecd3f09dca5274e2b67 100644 GIT binary patch delta 7737 zcmb6;Yfu~4db`p}NJv6LfGqLy0^%X_umPK3jGcga_(>d-;0Iu3(Joj-NMv`#LtRdbDi|=pl{Ef^PTT}=R4o)eEj_UKP=+~pBW7K7(CxN-1=Z@gcB2rr;@ zjEbqW!3b?)D#P8lO{=DhUm-5n(k0kBD`&kOMxKpCf(V%$RqIqo0t7=+7}{eiX)$$ zWmxLNpMCfN!;!~*Uf%8XQ_x>eHViR5c{0V$B{3(Qc!6+^LJ z80W|thG*Tb6ju^!mk_@!!_C4RK=yMy8Gc){(Az}LyJ6cDIVm&#J2?GbM(>npoADk= zOcD*mAMF44B@^z=UoCD*vBA(1IhW7n=9ut*=Qo)_+$pC0z&*1-x%e=tYi%I=e5BXM zliW-0S&)JoB+rrvh4L@P8k>BDX!C;U9=qC6hb&6CrJ zQF)>8i~2WA6<{PwSdt)3nxzvDd+|G3gI$Y1y3=Agq$s=sgJNQRfLna1#8n+$`o3mIjICfqQ2%UMDWg1DUKVOzLkc*{TdD3d9CaH| zS5`FQy?0JkI26iGK_9+X*)gOy&BvW){yPB=Ae-;OD}=(NP^NsoB6$&c2W36 zvWl_G-`O`^EcvAp1cN3)IsIfDi-c471e`t6s9C_nWL0rUPzZU`j_fjWi?-6X_Y|ow zLGyzc+8@Nw2?oI^n1n*XOqafupB7gMhGQ#=8~p^vH_A&-H-%rTs%j?XZ@yY?O)Bgd ztAbMiNnXNk{M9_RpuB|5@=%9nMK zFL;@SS;obKLC#Q9UN`TjWmBsU=<2~XS#l9noeWr8$YF*D7b=-qy-c#cg`EP1a!7)K zBT^+PT9ZCM`1lN&@agKsqDOh>C@NPMYE=_H_7vYRd7saG5<=G!3KdO9*%X za76&Z2duX6NA~iXKY@?isMiB0x`b_;iE0W?Z1!dO1G`` zTh`{7wRw$-TMukmJ7U(3xV3Zn_l2(RwuMj*f2V#OE)H!?%_oEzv>QzKDHe@ z*`oNgrSOzO^^Z6X(3RFV;1t7D@B{CGJ*WiZyM;|*L4|kda6a~A> zN8x3V@k(0Ekq|Z@WpGoTOeO%4S5h*HgjmCqzDe?UpY!<0S<(fnpUN{pL#80=BW>n# zf_D!)#dS&aQ!|_rfrc|cCN%2y^5c#!5`BD!CIp*V*5_ilG&QeNB26~HB$zYgpP7IV z37pLMU^dIRAhM-aM{z2h-bhR{l5ZvOCMmr_zU!vh5U77x}ycPw3rK zULVUi-CmkmaA+hmX6xa+SriCaWPL31_Uw-VB-FsUi{jazz=tD`MTW{YqX8r333;43 zO9CqGbj{7MRR{+ZA4@3cNe-zh(hhjoI@DPKr|isVNDc+*WZAa>=bz!nAr}BK+k)+} zEj@eBTyY=ESM}k`>K%XN zj6agS2u#zUX|xtR5ilGC4*<&{#vXg-Fb=pe7$td@ji)K7VwaQul=??ip=1 zXRDmF!Y-vi3W03s!9f7Gu&j3$F!|J`(-9#YRp@0eiw`*ztQn@TK>%RHIL9HaNt=`j z3>p##cYtwnaQE>}1st>B)SMJer-N@xx5o$RF?X!xan!GX3|9hR3EMGRZx_BQY#FO! z#;UlnM%2`Ro}a?7U0XnbN!o%2ILO_?bVdOIBmK}g4I2KQyd_Fa7SPJ?b`iAdaV!XB zySobMyxguJCWN0Usz5PFGZe@h+c&mPa5`BE77h@o9k7gzHjG{w8y|0UjE#?>renNu z%n?wGwLZdM3}{Ddj$IpTJ}Qhmlfo5?2nX0 ziMa7-rfY@1-Mb1swb>reYHw%_x|)}XZDY~vf$M=+1yNHbIj3D`h622FW=mlFe~RA% zd!}(@>OmY@3aO>g=Bgu}lZ=4dV6`;BIh<1oN?LLfT&0w{gn$Tm7|2l`%WE=TW?C(w z>;l8!T2f?am9#jK&4Xw)qDAMuopVXa8P)JwnUp}hho{!Z9$P%EZ^-p!lN*Gdn)*cS ze1V`bT37lQy(x^=|2d9wuRiQ`wM#LDxfJ72lL@L7A?$jApq2^WRNT-LSTHZBrm>(# zAi(cJ%nNE22un*F&|YQSV7`zq+pN1_CV|3XtNEk;_R?o{VZD<~=!FZEQ z6fTK&9LwRfG{fod!Kh1d2|~1d#RU2&T1XGje=2Cmp&u&GE(3F09EC2%bA*lUjvOg%U@u>VJ-$CU(l|`W> z(A>vN_@}z$LSUvY2^C7#4?&nkwJBCko*&^|0rm5XvygcQ3bWb08)}&xbeV>!nOP45 zSxCssL#p3JvJSVU&=ko0?O$KVu_+6}T|2dYK_PYZsY{N3ncQK-_27K!0cngze7@iqMkX* z?UArkg-^8Ey=lqy%QaEfH=mJJ2(Df15yXVLxwWmiy_Ic4AC>7j$NelTUxl&^Gz658 zYGnw+{^#62sP;<5hYXtyf$FnvZuid8M=YOH7a0m@n#fBq2k7nZXIVHxxMa*RtW>H(UbcKEb|*E$`lr_T)bJatll=7S30i$ z!7`B~I<=)c75gKWh$dcfczNiHqAIbrbG;x|J0yqenyKE0_wv!gFu8F=fUNFIon zwTh)}qPcxje-L`b;@ZD(iTk?Zbw|Y7Zqa;nQ-4hAwu`lgA`_99#oEK7y-PG7+0=L6 zCqe`X<=6709y^6!#(!OPIXV}sZi{q$Ufm^DT^99~0PmIUT|TpIZMcuA45cf?7j+#G zK6)zl)L^{s1l&Hii|WLE-FJ(+w`=NG&g>N1#Paq?U$iuKuup977fS~=iwD0bwk35i zJ6V94%0*+1wD*Dax%JRh(emu3>73NjD4x3f>!#6l``Yu7eevd_cbi7V#xc<{zG=FW z5soB__Q*h_GujsQM6ZbEGn@Le_v{C@><44^gHa+f7afT><8}~S)pl*`R_&o!?V)IK zl=`_Ps##Zy9fR@O6QaIaA{1(i9E{XOm16gE;*s;Bd3aO*yd*BPg~IpID(&LaFNnuR zMDvAB{l$AmyJ&0@4-AM0h9sW5q8-tO^@fdu8+D@PxlPmg45LXxWvJX~fTKJfZ|DkAtT;>9uX!nkO;vT1UrSNmf_QUzGQeVC7#Y%l?|52-@)+T_~B$i*lf z9a%s8Ps8GK=RX_SI3b!}*wl~g6qVfSxEbpB>Dk}HTy5ziZV@czrw0d*V!u@MRKn|5 zq;XJ5{A#}y+8=5TAlRuN)Z~9epq`H^dR6fH*lrm#U>|p>q5TPtBdE+r(6Dc?OZmxD zQqN%=wY!w4|ETuFQRRlZ=|rb;qYV$Ck4_ap$??K%j-VFYT*CIm%d`|k69l?i!d+z+ z+55`~umDgQnVOyTEG9~k(ZYZ5X5k+|PV`?G=ft8jRg+2Nr7qu`L&d(0;t%_m2z&q_ zp#prylX4*FZik%5Gby)We9S+vzku=ogdZ0M0B+!ZF|M-TSLdtBA6mOrrO9hJ*4~3} zwGG7D2E@Tpal$3G4aD2%r6IAXB}VLr|BzHRO{|@WRUF!?I1;Nkve`Wnueh)@w9*kH zN@3L84kxKPhcBzPb*7b}t>S&L;(d2@`?l?EkwY>2;Vt{qG5gc&1#$bc%R{#h-PKpY zK>~hKgXv2}!X{mP9QEPW`gct~1|AbCuYYECkzIp!_&@5;wxEk1`wRle)2AXt6KYAt zFI449RlwdttzU#MHIFr2JaBbwg(uzlc(^LIBo{ht%-1vhIg+d zG4#FeO1(E|ad7fK6rS1jI*IMIz4kgG8z(Vd8`~uO*x^qir36R++<3eSJ_>L4b z<~t!|L+@+h_v<8N{ND9FwEbxT;72eH@(`_TYefZ*&0VC%1OQG`0OYAWt<&gwpU=1v zXr^E{x*hDdDp0oPSu`B%h7L3=#$g+}EcS)jyi04x=c}34(O@#Gb^gbl=%u(32GQk! zmvfP1*-bF-*6Ga0DpymvhHCfJe2)$@VI+mxV1!g~d1d|{-X$@;IrD$5rLOJ%IM1%L zx=VNKobHi*w`})deLVz4>T0J2XtBBvKxQ-U@#|FthaGZ3Adz(~Nhv>4>DHE(8V|L8qM3&II4~l_y(1tpNQ8m{Q-|$55BDeSoZLRp=E#1l0RPY#~1y5t00Qp zu{ZubpBIOh_*jLH-QBpz_b%~?3ZM9w-|_tTTnjq7`L%U#l(DZJLwkF+^H1aZ2CdYB zl^Yb8g?5NB7DjIe=3SR?qbj3JO-#;IDP5B-zWS1O^F22=o#lZ$kZ!z%|sJ z6l}LJC0`_Gpk#9NTo>F`4kT6qp3VJm2tv=@V8c6i delta 38 rcmewy@Hv3@G%qg~0}yciUXUiTk=Ig{@y2F%RTX9q8AjnEbD#tO ChatPromptTemplate: 2. Start directly with the analysis sections as outlined below. 3. Provide all sections in the exact order and format specified. 4. Use at least 5 years of historical data prior to the target year for your analysis. - + 5. Analyze both income statements and balance sheets in your prediction. + 6. Focus on predicting the 'Resultado Líquido das Operações Continuadas' (Net Income from Continuing Operations) as the main earnings metric. + Your response must follow this exact structure: - Panel A ||| [Trend Analysis: Analyze relevant trends over at least the past five years.] + Panel A ||| [Trend Analysis: Analyze relevant trends over at least the past five years, with a focus on 'Resultado Líquido das Operações Continuadas'.] Panel B ||| [Ratio Analysis: Calculate and analyze key financial ratios over at least the past five years, interpreting their implications for future earnings.] - Panel C ||| [Rationale: Summarize your analyses and explain your prediction reasoning concisely, considering the long-term trends.] + Panel C ||| [Rationale: Summarize your analyses and explain your prediction reasoning concisely, considering the long-term trends and focusing on 'Resultado Líquido das Operações Continuadas'.] Direction ||| [increase/decrease] Magnitude ||| [large/moderate/small] Confidence ||| [0.00 to 1.00] Additional guidelines: - - Be precise and focused in your explanations. - - For Magnitude, use only one of these words: large, moderate, or small. + - Be precise, focused and cocise in your explanations. + - For Magnitude, you must use exactly one of these words: large, moderate, or small. Do not skip this or use any other terms. - For Confidence, provide a single number between 0.00 and 1.00. - Do not include formulas or calculations in your response. - Use '|||' as a delimiter between section headers and content. - Ensure your analysis covers at least 5 years of historical data. - + - Return responses in English. + - No need to define fomulas or calculations in your response. Just mention the ratio or the value by name. + - When referring to earnings, always use 'Resultado Líquido das Operaçes Continuadas' as the key metric, but call it just earnings. Financial data: {financial_data} Target year: {target_year} @@ -71,8 +75,6 @@ def create_prompt_template() -> ChatPromptTemplate: return ChatPromptTemplate.from_template(template) def get_financial_prediction(financial_data: Dict[str, Any], n_years: int) -> Dict[int, Any]: - - """Calls the prompt template and returns the entire response in a dictionary for a given CD_CVM.""" try: print("Starting get_financial_prediction...") @@ -105,11 +107,11 @@ def get_financial_prediction(financial_data: Dict[str, Any], n_years: int) -> Di filtered_financial_data = { key: [ [{k: v for k, v in item.items() if k == 'DS_CONTA' or (k.startswith('20') and data_from <= int(k.split('-')[0]) <= data_up_to)} - for item in statement] - for statement in value - ] - for key, value in financial_data.items() - } + for item in statement] + for statement in value + ] + for key, value in financial_data.items() + } prompt = prompt_template.format(financial_data=filtered_financial_data, target_year=year) prompts.append(prompt) @@ -164,7 +166,14 @@ def parse_financial_prediction(prediction_dict: Dict[int, Any]) -> pd.DataFrame: direction = 1 if direction_match and 'increase' in direction_match.group(1).lower() else -1 magnitude_match = re.search(r'Magnitude \|\|\| (\w+)', text, re.IGNORECASE) - magnitude = magnitude_match.group(1).lower() if magnitude_match else 'N/A' + if magnitude_match: + magnitude = magnitude_match.group(1).lower() + if magnitude not in ['large', 'moderate', 'small']: + print(f"Warning: Unexpected magnitude value '{magnitude}' for year {year}. Setting to 'moderate'.") + magnitude = 'moderate' + else: + print(f"Warning: No magnitude found for year {year}. Setting to 'moderate'.") + magnitude = 'moderate' confidence_match = re.search(r'Confidence \|\|\| (\d+\.\d+)', text, re.IGNORECASE) try: @@ -191,4 +200,134 @@ def parse_financial_prediction(prediction_dict: Dict[int, Any]) -> pd.DataFrame: 'Model Name': model_name }) - return pd.DataFrame(parsed_data) \ No newline at end of file + return pd.DataFrame(parsed_data) + +def get_financial_prediction_list(CD_CVM_list: List[int], n_years: int) -> pd.DataFrame: + """ + Generates financial predictions for a list of CD_CVM codes and target years. + + Args: + CD_CVM_list (List[int]): List of CD_CVM codes to process. + n_years (int): Number of most recent years to predict for each CD_CVM code. + + Returns: + pd.DataFrame: A DataFrame containing predictions for all CD_CVM codes and target years. + """ + all_predictions = [] + + for cd_cvm in CD_CVM_list: + print(f"Processing CD_CVM: {cd_cvm}") + financial_data = get_financial_data([cd_cvm]) + predictions = get_financial_prediction(financial_data, n_years) + + if predictions: + df = parse_financial_prediction(predictions) + df['CD_CVM'] = cd_cvm + all_predictions.append(df) + else: + print(f"No predictions generated for CD_CVM: {cd_cvm}") + + if all_predictions: + return pd.concat(all_predictions, ignore_index=True) + else: + return pd.DataFrame() + +def post_added_data(predictions_df: pd.DataFrame) -> pd.DataFrame: + """ + Adds an actual_earnings_direction column and a NAME column to the predictions DataFrame. + + Args: + predictions_df (pd.DataFrame): DataFrame returned by get_financial_prediction_list + + Returns: + pd.DataFrame: Updated DataFrame with actual_earnings_direction and NAME columns + """ + def normalize_string(s): + return unidecode(s).lower() + + def strip_markdown(text): + # Remove bold and italic markers + text = re.sub(r'\*\*|__', '', text) + text = re.sub(r'\*|_', '', text) + # Remove links + text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) + # Remove backticks + text = re.sub(r'`', '', text) + # Remove any remaining special characters + text = re.sub(r'[#>~\-=|]', '', text) + return text.strip() + + def get_actual_direction(row): + cd_cvm = row['CD_CVM'] + year = row['Year'] + + try: + financial_data = get_financial_data([cd_cvm]) + if not financial_data or 'income_statements' not in financial_data or not financial_data['income_statements']: + print(f"No financial data found for CD_CVM: {cd_cvm}") + return np.nan + + income_statement = financial_data['income_statements'][0] + + print(f"Debug: Income statement structure for CD_CVM {cd_cvm}:") + print(f"Type: {type(income_statement)}") + print(f"Number of items: {len(income_statement)}") + print(f"Sample content: {income_statement[:2]}") + + earnings_metrics = [ + 'Resultado Liquido das Operacoes Continuadas', + 'Lucro/Prejuizo Consolidado do Periodo', + 'Lucro/Prejuizo do Periodo' + ] + + normalized_metrics = [normalize_string(metric) for metric in earnings_metrics] + + earnings_row = None + for item in income_statement: + normalized_ds_conta = normalize_string(item['DS_CONTA']) + if normalized_ds_conta in normalized_metrics: + earnings_row = item + print(f"Using earnings metric: {item['DS_CONTA']}") + break + + if earnings_row is None: + print(f"No suitable earnings metric found for CD_CVM: {cd_cvm}") + print(f"Available metrics: {[item['DS_CONTA'] for item in income_statement]}") + return np.nan + + print(f"Debug: Earnings row for CD_CVM {cd_cvm}: {earnings_row}") + + current_year_earnings = earnings_row.get(f'{year}-12-31') + previous_year_earnings = earnings_row.get(f'{year-1}-12-31') + + print(f"Debug: Current year earnings ({year}): {current_year_earnings}") + print(f"Debug: Previous year earnings ({year-1}): {previous_year_earnings}") + + if current_year_earnings is None or previous_year_earnings is None: + print(f"Missing earnings data for CD_CVM: {cd_cvm}, Year: {year}") + return np.nan + + try: + current_year_earnings = float(current_year_earnings) + previous_year_earnings = float(previous_year_earnings) + except ValueError: + print(f"Error converting earnings to float for CD_CVM: {cd_cvm}, Year: {year}") + return np.nan + + return 1 if current_year_earnings > previous_year_earnings else -1 + except Exception as e: + print(f"Error processing CD_CVM: {cd_cvm}, Year: {year}. Error: {str(e)}") + return np.nan + + # Apply the function to each row + predictions_df['actual_earnings_direction'] = predictions_df.apply(get_actual_direction, axis=1) + + # Add the NAME column + predictions_df['NAME'] = predictions_df['CD_CVM'].apply(get_company_name_by_cd_cvm) + + # Strip markdown from Panel A, B, and C + for panel in ['Panel A', 'Panel B', 'Panel C']: + if panel in predictions_df.columns: + predictions_df[panel] = predictions_df[panel].apply(strip_markdown) + + return predictions_df \ No newline at end of file diff --git a/utils.py b/utils.py index d780852..7faf2fa 100644 --- a/utils.py +++ b/utils.py @@ -149,6 +149,7 @@ def get_company_name_by_cd_cvm(cd_cvm): return None def analyze_model_performance(df): + grouped = df.groupby(['Model', 'Company']) # Initialize an empty DataFrame to store the results @@ -176,4 +177,6 @@ def analyze_model_performance(df): }]) results = pd.concat([results, current_results], ignore_index=True) - return results \ No newline at end of file + return results + +#get \ No newline at end of file