2024
|
Nachtegael, Charlotte Active learning for biomedical relation extraction, the oligogenic use case PhD Thesis 2024. @phdthesis{nachtegael2024,
title = {Active learning for biomedical relation extraction, the oligogenic use case},
author = {Nachtegael, Charlotte},
url = {https://difusion.ulb.ac.be/vufind/Record/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/375304/Holdings},
year = {2024},
date = {2024-06-28},
abstract = {In a context where technological advancements have enabled increased availability of genetic data through high-throughput sequencing technologies, the complexity of genetic diseases has become increasingly apparent. Oligogenic diseases, characterised by a combination of genetic variants in two or more genes, have emerged as a crucial research area, challenging the traditional model of "one genotype, one phenotype". Thus, understanding the underlying mechanisms and genetic interactions of oligogenic diseases has become a major priority in biomedical research. This context underlines the importance of developing dedicated tools to study these complex diseases. Our first major contribution, OLIDA, is an innovative database designed to collect data on variant combinations responsible for these diseases, filling significant gaps in the current knowledge, which until now has focused on digenic diseases. This resource, accessible via a web platform, adheres to the FAIR principles and represents a significant advancement over its predecessor, DIDA, in terms of data curation and quality assessment. Furthermore, to support the biocuration of oligogenic diseases, we used active learning to construct DUVEL, a biomedical corpus focused on digenic variant combinations. To achieve this, we first investigated how to optimise these methods across numerous biomedical relation extraction datasets and developed a web-based platform, ALAMBIC, for text annotation using active learning. Our results and the quality of the corpus obtained demonstrate the effectiveness of active learning methods in biomedical relation annotation tasks. By establishing a curation pipeline for oligogenic diseases, as well as standards for integrating active learning methods into biocuration, our work represents a significant advancement in the field of biomedical natural language processing and the understanding of oligogenic diseases.
},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
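The corpus construction described above rests on an active learning loop: train on the labelled examples so far, ask the model which unlabelled candidates it is least certain about, and send only those to the human curators. Below is a minimal sketch of pool-based uncertainty sampling; the logistic model and synthetic features are placeholders, not the thesis's ALAMBIC/DUVEL pipeline.

# Minimal pool-based active learning loop with uncertainty sampling.
# Hypothetical stand-in for the family of strategies studied in the thesis.
import numpy as np
from sklearn.linear_model import LogisticRegression

def uncertainty_sampling(model, X_pool, batch_size=10):
    """Pick the pool examples whose predicted probability is closest to 0.5."""
    proba = model.predict_proba(X_pool)[:, 1]
    return np.argsort(np.abs(proba - 0.5))[:batch_size]

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))                    # placeholder for text features
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)      # synthetic relation labels

labelled = list(range(20))                         # small seed set annotated up front
pool = [i for i in range(len(X)) if i not in labelled]

for _ in range(5):                                 # five annotation rounds
    model = LogisticRegression().fit(X[labelled], y[labelled])
    picked = [pool[i] for i in uncertainty_sampling(model, X[pool])]
    labelled += picked                             # a human curator labels these
    pool = [i for i in pool if i not in picked]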
|
Versbraegen, Nassim Discovering multivariant pathogenic patterns among patients with rare diseases PhD Thesis 2024. @phdthesis{versbraegen2024,
title = {Discovering multivariant pathogenic patterns among patients with rare diseases},
author = {Versbraegen, Nassim},
url = {https://difusion.ulb.ac.be/vufind/Record/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/375378/Holdings},
year = {2024},
date = {2024-06-24},
abstract = {Increasing evidence points to the complex interplay of multiple genetic variants as a major contributing factor in many human diseases. Oligogenic diseases, in which a small set of genes collaborate to cause a pathology, present a compelling example of this phenomenon and necessitate a shift away from traditional single-gene inheritance models. Our work aimed to develop robust methods for pinpointing pathogenic combinations of genetic variants across patient cohorts, ultimately improving disease understanding and potentially guiding future diagnostic approaches.We began by developing a novel machine learning framework that integrates explainable AI (XAI) techniques and game-theoretic concepts. This framework allows us to classify and characterise different types of oligogenic effects, providing insights into the specific mechanisms by which multiple genes interact to drive disease. Next, we focused on refining existing computational methods used to predict the pathogenicity of variant combinations. Our emphasis was two-fold: improving computational efficiency for handling the expansive datasets associated with cohort analysis, and critically, reducing false-positive rates to ensure the reliability of our results. With these tools in hand, we developed a specialised cohort analysis approach tailored to investigating diseases with complex genetic origins. To demonstrate the capabilities of our methodology, we delved into a Marfan syndrome cohort. Marfan syndrome is a hereditary condition affecting the body's connective tissue. Our analysis successfully uncovered potential modifier mutations that appear to interact with the primary disease-causing variant, offering new clues about the intricate genetic landscape of this condition.
},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
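The "game-theoretic concepts" mentioned in the abstract are commonly instantiated with Shapley values, which split a prediction fairly among the input features. The toy exact computation below assumes an additive value function so the result can be checked by eye; the feature names and scoring function are hypothetical, not the thesis's trained predictor.

# Exact Shapley values over four toy features of a variant combination.
from itertools import combinations
from math import factorial

FEATURES = ["variant_a_impact", "variant_b_impact", "gene_distance", "expression_overlap"]

def value(coalition, x):
    """Toy coalition value: sum of the contributions of the present features."""
    return sum(x[f] for f in coalition)

def shapley(x):
    n = len(FEATURES)
    phi = {}
    for f in FEATURES:
        others = [g for g in FEATURES if g != f]
        total = 0.0
        for k in range(n):
            for S in combinations(others, k):
                weight = factorial(k) * factorial(n - k - 1) / factorial(n)
                total += weight * (value(set(S) | {f}, x) - value(S, x))
        phi[f] = total
    return phi

x = {"variant_a_impact": 0.8, "variant_b_impact": 0.6,
     "gene_distance": 0.1, "expression_overlap": 0.3}
print(shapley(x))  # for an additive value function, phi[f] == x[f]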
|
Abels, Axel Resolving Knowledge Limitations for Improved Collective Intelligence: A novel online machine learning approach PhD Thesis 2024. @phdthesis{abels2024,
title = {Resolving Knowledge Limitations for Improved Collective Intelligence: A novel online machine learning approach},
author = {Abels, Axel},
url = {https://difusion.ulb.ac.be/vufind/Record/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/373334/Holdings},
year = {2024},
date = {2024-04-23},
urldate = {2024-04-23},
abstract = {One of the reasons human groups struggle to make the best decisions is that they are inherently biased in their beliefs. In essence, our perception of what is true is often distorted by individual and social biases, including stereotypes. When individuals deliberate about a decision, they tend to transmit these beliefs to others, thereby steering the entire group away from the best decision. For example, a senior doctor could spread a misinterpretation of symptoms to junior doctors, resulting in inappropriate treatments. The primary objective of this thesis is to mitigate the impact of such biases on group decision-making in domains such as medical diagnostics, policy-making, and crowdsourced fact-checking. We propose to achieve this by having humans interact through a collective decision-making platform in charge of handling the aggregation of group knowledge. The key hypothesis here is that by carefully managing the collectivization of knowledge through this platform, it will be substantially harder for humans to impose their biases on the final decision. The core of our work involves the development and analysis of algorithms for decision-making systems. These algorithms are designed to effectively aggregate diverse expertise while addressing biases. We thus focus on aggregation methods that use online learning to foster collective intelligence more effectively. In doing so, we take into account the nuances of individual expertise and the impact of biases, aiming to filter out noise and enhance the reliability of collective decisions. Our theoretical analysis of the proposed algorithms is complemented by rigorous testing in both simulated and online experimental environments to validate the system’s effectiveness. Our results demonstrate a significant improvement in performance and reduction in bias influence. These findings not only highlight the potential of technology-assisted decision-making but also underscore the value of addressing human biases in collaborative environments.
},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
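The aggregation idea can be made concrete with the classic exponential-weights (Hedge) scheme: experts whose forecasts prove inaccurate are progressively downweighted, which limits how much a biased expert can steer the group. This is only a textbook baseline for the family of online learning methods the thesis builds on; the expert data below are synthetic.

# Exponential-weights aggregation of expert forecasts (Hedge).
import numpy as np

def hedge(expert_predictions, outcomes, eta=0.5):
    """expert_predictions: (T, n_experts) probabilities; outcomes: (T,) in {0, 1}."""
    T, n = expert_predictions.shape
    weights = np.ones(n)
    for t in range(T):
        p = weights / weights.sum()
        group_belief = p @ expert_predictions[t]           # the platform's aggregate
        loss = (expert_predictions[t] - outcomes[t]) ** 2  # squared loss per expert
        weights *= np.exp(-eta * loss)                     # downweight poor experts
    return weights / weights.sum()

rng = np.random.default_rng(1)
truth = rng.integers(0, 2, size=200).astype(float)
good = np.clip(truth + rng.normal(0, 0.2, size=200), 0, 1)  # roughly calibrated expert
biased = np.full(200, 0.8)                                  # stereotype-like fixed belief
print(hedge(np.column_stack([good, biased]), truth))        # weight shifts to the good expert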
|
Verhelst, Theo Causal and predictive modeling of customer churn - Lessons learned from empirical and theoretical research PhD Thesis 2024. @phdthesis{verhelst2024,
title = {Causal and predictive modeling of customer churn - Lessons learned from empirical and theoretical research},
author = {Theo Verhelst},
url = {https://difusion.ulb.ac.be/vufind/Record/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/368384/Holdings},
year = {2024},
date = {2024-01-29},
urldate = {2024-01-29},
abstract = {Customer churn is an important concern for large companies, especially in the telecommunications sector. Customer retention campaigns are often used to mitigate churn, but targeting the right customers based on their historical profiles presents an important challenge. Companies usually have recourse to two data-driven approaches: churn prediction and uplift modeling. In churn prediction, customers are selected on the basis of their propensity to churn in the near future. In uplift modeling, only customers who react positively to the campaign are considered. Uplift modeling is used in various other domains, such as marketing, healthcare, and finance. Despite the theoretical appeal of uplift modeling, its added value with respect to conventional machine learning approaches has rarely been quantified in the literature.
This doctoral thesis is the result of a collaborative research project between the Machine Learning Group (ULB) and Orange Belgium, funded by Innoviris. This collaboration offers a unique research opportunity to assess the added value of causal-oriented strategies to address customer churn in the telecommunications sector. Following the introduction, we give the necessary background in probability theory, causality theory, and machine learning, and we describe the state of the art in uplift modeling and counterfactual identification. Then, we present the contributions of this thesis:
• An empirical comparison of various predictive and causal models for selecting customers in churn prevention campaigns. We perform several benchmarks of different state-of-the-art approaches on real-world datasets and in live campaigns with our industrial partner; we propose a new approach that exploits domain knowledge to improve predictions; and we make available the first public churn dataset for uplift modeling, whose unique characteristics make it more challenging than the few other public uplift datasets.
• Counterfactual identification allows one to classify the different behaviors of customers in response to a marketing incentive. This can be used to establish profiles of customers sensitive to the campaign, and subsequently improve marketing operations. We derive novel bounds and point estimators on the probability of counterfactual statements based on uplift models.
• A comprehensive comparison of predictive and uplift modeling, starting from firm theoretical foundations and highlighting the parameters that influence the performance of both approaches. In particular, we provide a new formulation of the measure of profit, a formal proof of the convergence of the uplift curve to the measure of profit, and an illustration, through simulations, of the conditions under which predictive approaches still outperform uplift modeling.
Our theoretical and empirical assessments of uplift modeling suggest that it often fails to deliver the anticipated advantages over predictive modeling, especially in scenarios such as customer churn within the telecom sector, characterized by class imbalance, limited separability, and cost-benefit considerations. These results are broadly aligned with the practical experience of our industrial partner and with the existing scientific literature. Our counterfactual probability estimators allow us to characterize customers at a level inaccessible to conventional predictive modeling, revealing new insights on the behavior and preferences of customers.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
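The two approaches contrasted in the abstract can be sketched side by side: a churn-propensity model ranks customers by predicted churn regardless of the campaign, while a "two-model" uplift estimate scores the difference the campaign makes. The two-model formulation below is one standard uplift baseline on synthetic data, not necessarily the estimators benchmarked in the thesis.

# Churn propensity vs. two-model uplift on synthetic campaign data.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(2)
n = 5000
X = rng.normal(size=(n, 8))
treated = rng.integers(0, 2, size=n)                  # 1 = contacted by the campaign
base = 1 / (1 + np.exp(-X[:, 0]))                     # latent churn propensity
churn = (rng.random(n) < base - 0.05 * treated * (X[:, 1] > 0)).astype(int)

# Churn prediction: rank customers by P(churn), ignoring the campaign.
propensity = RandomForestClassifier(random_state=0).fit(X, churn)

# Uplift (two-model): fit separate models on treated and control customers,
# then score P(churn | control) - P(churn | treated).
m_treat = RandomForestClassifier(random_state=0).fit(X[treated == 1], churn[treated == 1])
m_ctrl = RandomForestClassifier(random_state=0).fit(X[treated == 0], churn[treated == 0])
uplift = m_ctrl.predict_proba(X)[:, 1] - m_treat.predict_proba(X)[:, 1]
# Positive uplift ~ customers the campaign actually helps retain.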
|
2022
|
Ciortan, Madalina Unsupervised analysis of scRNA-seq data with machine learning models PhD Thesis 2022, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/340848,
title = {Unsupervised analysis of scRNA-seq data with machine learning models},
author = {Madalina Ciortan},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/340848},
year = {2022},
date = {2022-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
De Stefani, Jacopo Towards multivariate multi-step-ahead time series forecasting: A machine learning perspective PhD Thesis 2022, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/340052,
title = {Towards multivariate multi-step-ahead time series forecasting: A machine learning perspective},
author = {Jacopo De Stefani},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/340052},
year = {2022},
date = {2022-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2021
|
Buroni, Giovanni On-Board-Unit big data analytics: from data architecture to traffic forecasting PhD Thesis 2021, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/334819,
title = {On-Board-Unit big data analytics: from data architecture to traffic forecasting},
author = {Giovanni Buroni},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/334819},
year = {2021},
date = {2021-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2020
|
Mon Père, Nathaniel Statistical biophysics of hematopoiesis and growing cell populations PhD Thesis 2020, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/314684,
title = {Statistical biophysics of hematopoiesis and growing cell populations},
author = {Nathaniel Mon Père},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/314684},
year = {2020},
date = {2020-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Papadimitriou, Sofia Towards multivariant pathogenicity predictions: Using machine-learning to directly predict and explore disease-causing oligogenic variant combinations PhD Thesis 2020, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/312576,
title = {Towards multivariant pathogenicity predictions: Using machine-learning to directly predict and explore disease-causing oligogenic variant combinations},
author = {Sofia Papadimitriou},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/312576},
year = {2020},
date = {2020-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2019
|
Orlando, Gabriele The role of dynamics in emergent protein properties PhD Thesis 2019, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/287368,
title = {The role of dynamics in emergent protein properties},
author = {Gabriele Orlando},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/287368},
year = {2019},
date = {2019-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Davila, Caroll Isabelle Weneya'a – "quien habla con los cerros". Memoria, mántica y paisaje sagrado en la Sierra Norte de Oaxaca PhD Thesis 2019, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/285389,
title = {Weneya'a – "quien habla con los cerros". Memoria, mántica y paisaje sagrado en la Sierra Norte de Oaxaca},
author = {Caroll Isabelle Davila},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/285389},
year = {2019},
date = {2019-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2018
|
Gazzo, Andrea Beyond monogenic diseases: a first collection and analysis of digenic diseases PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/272617,
title = {Beyond monogenic diseases: a first collection and analysis of digenic diseases},
author = {Andrea Gazzo},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/272617},
year = {2018},
date = {2018-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Carcillo, Fabrizio Beyond Supervised Learning in Credit Card Fraud Detection: A Dive into Semi-supervised and Distributed Learning PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/272119b,
title = {Beyond Supervised Learning in Credit Card Fraud Detection: A Dive into Semi-supervised and Distributed Learning},
author = {Fabrizio Carcillo},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/272119},
year = {2018},
date = {2018-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Bizet, Martin Bioinformatic inference of a prognostic epigenetic signature of immunity in breast cancers PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/265092b,
title = {Bioinformatic inference of a prognostic epigenetic signature of immunity in breast cancers},
author = {Martin Bizet},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/265092},
year = {2018},
date = {2018-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Chen, Jixin Some Domain Decomposition and Convex Optimization Algorithms with Applications to Inverse Problems PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/271782,
title = {Some Domain Decomposition and Convex Optimization Algorithms with Applications to Inverse Problems},
author = {Jixin Chen},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/271782},
year = {2018},
date = {2018-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Reggiani, Claudio Bioinformatic discovery of novel exons expressed in human brain and their association with neurodevelopmental disorders PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/270994b,
title = {Bioinformatic discovery of novel exons expressed in human brain and their association with neurodevelopmental disorders},
author = {Claudio Reggiani},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/270994},
year = {2018},
date = {2018-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Carcillo, Fabrizio Beyond Supervised Learning in Credit Card Fraud Detection: A Dive into Semi-supervised and Distributed Learning PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/272119,
title = {Beyond Supervised Learning in Credit Card Fraud Detection: A Dive into Semi-supervised and Distributed Learning},
author = {Fabrizio Carcillo},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/272119/5/ContratDiCarcillo.pdf},
year = {2018},
date = {2018-01-01},
abstract = {The expansion of electronic commerce, as well as customers' increasing confidence in electronic payments, makes fraud detection a critical issue. The design of a prompt and accurate Fraud Detection System is a priority for many organizations in the business of credit cards. In this thesis we present a series of studies to increase the precision and the speed of fraud detection systems. The thesis has three main contributions. The first concerns the integration of unsupervised techniques and supervised classifiers. We proposed several approaches to integrate outlier scores in the detection process, and we found that the accuracy of a conventional classifier may be improved when information about the input distribution is used to augment the training set. The second contribution concerns the role of active learning in fraud detection. We extensively compared several state-of-the-art techniques and found that Stochastic Semi-supervised Learning is a convenient approach to tackle the selection bias problem in the active learning process. The third contribution of the thesis is the design, implementation and assessment of SCARFF, an original framework for near real-time streaming fraud detection. This framework integrates Big Data technology (notably tools like Kafka, Spark and Cassandra) with a machine learning approach to deal with imbalance, non-stationarity and feedback latency in a scalable manner. Experimental results on a massive dataset of real credit card transactions have shown that our framework is scalable, efficient and accurate over a large stream of transactions.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
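The first contribution, feeding unsupervised information about the input distribution into a supervised classifier, can be sketched by augmenting each transaction's features with an outlier score. The models and synthetic data below are placeholders for the detectors actually studied in the thesis.

# Augment a fraud classifier's inputs with unsupervised outlier scores.
import numpy as np
from sklearn.ensemble import IsolationForest, RandomForestClassifier

rng = np.random.default_rng(4)
X = rng.normal(size=(2000, 10))
y = (rng.random(2000) < 0.02).astype(int)     # ~2% fraud: strong class imbalance
X[y == 1] += 2.0                              # frauds drift away from the bulk

iso = IsolationForest(random_state=0).fit(X)  # unsupervised pass
outlier_score = iso.score_samples(X).reshape(-1, 1)

X_aug = np.hstack([X, outlier_score])         # original features + outlier score
clf = RandomForestClassifier(random_state=0).fit(X_aug, y)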
|
Bizet, Martin Bioinformatic inference of a prognostic epigenetic signature of immunity in breast cancers PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/265092,
title = {Bioinformatic inference of a prognostic epigenetic signature of immunity in breast cancers},
author = {Martin Bizet},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/265092/7/ContratDiBizet.pdf},
year = {2018},
date = {2018-01-01},
abstract = {The alteration of epigenetic marks is increasingly recognized as a fundamental characteristic of cancers. In this thesis, we used DNA methylation profiles to improve the classification of breast cancer patients through a machine learning-based approach. The long-term objective is the development of clinical tools for personalized medicine. The DNA methylation data were acquired with a methylation-dedicated DNA microarray called Infinium. This technology is recent compared to, for example, gene expression microarrays, and its preprocessing is not yet standardized. The first part of this thesis was therefore devoted to evaluating normalization methods by comparing the normalized data with other technologies (pyrosequencing and RRBS) for the two most recent Infinium technologies (450k and 850k). We also evaluated the coverage of biologically relevant regions (promoters and enhancers) by the two technologies. We then used the (properly preprocessed) Infinium data to develop a score, called the MeTIL score, which has prognostic and predictive value in breast cancers. We took advantage of the ability of DNA methylation to reflect cellular composition to extract a methylation signature (that is, a set of DNA positions where methylation varies) reflecting the presence of lymphocytes in the tumour sample. After selecting sites with lymphocyte-specific methylation, we developed a machine learning-based approach to obtain a signature of an optimal size, reduced to five sites, potentially suitable for clinical use. After converting this signature into a score, we showed its specificity for lymphocytes using external data and computer simulations. We then showed the ability of the MeTIL score to predict the response to chemotherapy as well as its prognostic power in independent breast cancer cohorts and even in other cancers.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
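One way to read "a signature of an optimal size, reduced to five sites" is as a sparse feature selection problem. The sketch below uses L1-regularised logistic regression on synthetic methylation beta-values to shrink a signature to a handful of CpG sites; it is illustrative only and does not reproduce the selection procedure actually used for the MeTIL score.

# Sparse CpG selection with L1-regularised logistic regression (illustrative).
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(3)
n_samples, n_cpgs = 300, 200
beta_values = rng.random((n_samples, n_cpgs))    # methylation levels in [0, 1]
signal = beta_values[:, :5].sum(axis=1)          # 5 truly lymphocyte-linked sites
high_infiltration = (signal > np.median(signal)).astype(int)

# A strong L1 penalty drives most CpG weights to exactly zero.
model = LogisticRegression(penalty="l1", solver="liblinear", C=0.1)
model.fit(beta_values, high_infiltration)
selected = np.flatnonzero(model.coef_[0])
print(len(selected), "CpGs retained:", selected)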
|
Reggiani, Claudio Bioinformatic discovery of novel exons expressed in human brain and their association with neurodevelopmental disorders PhD Thesis 2018, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/270994,
title = {Bioinformatic discovery of novel exons expressed in human brain and their association with neurodevelopmental disorders},
author = {Claudio Reggiani},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/270994/5/ContratDiReggiani.pdf},
year = {2018},
date = {2018-01-01},
abstract = {An important quest in genomics since the publication of the first complete human genome in 2003 has been its functional annotation. DNA holds the instructions for the production of the components necessary for the life of cells and organisms. A complete functional catalog of genomic regions will help the understanding of the cell body and its dynamics, thus creating links between genotype and phenotypic traits. The need for annotations prompted the development of several bioinformatic methods. In the context of promoter and first exon predictors, the majority of models rely principally on structural and chemical properties of the DNA sequence. Some of them integrate information from epigenomic and transcriptomic data as secondary features. Current genomic research asserts that reference genome annotations are far from complete (the human organism included). Physicians rely on reference genome annotations and functional databases to understand disorders with a genetic basis, and missing annotations may lead to unresolved cases. Because of their complexity, neurodevelopmental disorders are under study to identify all the genetic regions involved. Besides functional validation on model organisms, the search for genotype-phenotype associations is supported by statistical analysis, which is typically biased towards known functional regions. This thesis addresses the use of an in-silico integrative analysis to improve reference genome annotations and discover novel functional regions associated with neurodevelopmental disorders. The contributions outlined in this document have practical applications in clinical settings. The presented bioinformatic method is based on epigenomic and transcriptomic data, thus excluding features derived from the DNA sequence. Such an integrative approach applied to brain data allowed the discovery of two novel promoters and coding first exons in the human DLG2 gene, which were also found to be statistically associated with neurodevelopmental disorders, and intellectual disability in particular. The application of the same methodology to the whole genome resulted in the discovery of other novel exons expressed in the brain. Concerning the in-silico method itself, the research demanded a large number of functional and clinical datasets to properly support and validate our discoveries. This work describes a bioinformatic method for genome annotation, in the specific area of promoters and first exons. So far the method has been applied to brain data, and the extension to whole-body data would be a logical by-product. We will leverage distributed frameworks to tackle the even larger amount of data to analyse, a task that has already begun. Another interesting research direction that emerged from this work is the temporal enrichment analysis of epigenomic data across different developmental stages, in which changes of epigenomic enrichment suggest time-specific and tissue-specific regulation of genes and gene isoforms.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
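The integrative analysis can be pictured as interval arithmetic over genomic tracks: a candidate novel promoter/first exon is a promoter-mark peak that overlaps transcribed sequence but no annotated exon. The intervals below are toy values, not the thesis's genome-wide data.

# Toy interval intersection for candidate novel first exons.
def overlaps(a, b):
    """True if half-open genomic intervals (start, end) intersect."""
    return a[0] < b[1] and b[0] < a[1]

h3k4me3_peaks = [(1000, 1600), (8000, 8700)]   # promoter-associated epigenomic mark
rnaseq_blocks = [(1200, 1900), (5000, 5300)]   # transcribed regions from RNA-seq
annotated_exons = [(5000, 5300)]               # reference annotation

candidates = [peak for peak in h3k4me3_peaks
              if any(overlaps(peak, rna) for rna in rnaseq_blocks)
              and not any(overlaps(peak, exon) for exon in annotated_exons)]
print(candidates)  # [(1000, 1600)] -> candidate novel promoter / first exon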
|
2017
|
Amghar, Mohamed Multiscale local polynomial transforms in smoothing and density estimation PhD Thesis 2017, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/262040,
title = {Multiscale local polynomial transforms in smoothing and density estimation},
author = {Mohamed Amghar},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/262040},
year = {2017},
date = {2017-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Raimondi, Daniele The effect of genome variation on human proteins: understanding variants and improving their deleteriousness prediction through extensive contextualisation PhD Thesis 2017, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/251313,
title = {The effect of genome variation on human proteins: understanding variants and improving their deleteriousness prediction through extensive contextualisation},
author = {Daniele Raimondi},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/251313},
year = {2017},
date = {2017-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2016
|
Zisis, Ioannis The Effect of Group Formation on Behaviour: An Experimental and Evolutionary Analysis PhD Thesis 2016, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/231974,
title = {The Effect of Group Formation on Behaviour: An Experimental and Evolutionary Analysis},
author = {Ioannis Zisis},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/231974},
year = {2016},
date = {2016-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2015
|
Dal Pozzolo, Andrea Adaptive Machine Learning for Credit Card Fraud Detection PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/221654b,
title = {Adaptive Machine Learning for Credit Card Fraud Detection},
author = {Andrea Dal Pozzolo},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/221654},
year = {2015},
date = {2015-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Lopes, Miguel Inference of gene networks from time series expression data and application to type 1 Diabetes PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/216729b,
title = {Inference of gene networks from time series expression data and application to type 1 Diabetes},
author = {Miguel Lopes},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/216729},
year = {2015},
date = {2015-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Lerman, Liran A machine learning approach for automatic and generic side-channel attacks PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209070b,
title = {A machine learning approach for automatic and generic side-channel attacks},
author = {Liran Lerman},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/209070},
year = {2015},
date = {2015-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Hajingabo, Leon Analyzing molecular network perturbations in human cancer: application to mutated genes and gene fusions involved in acute lymphoblastic leukemia PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209126b,
title = {Analyzing molecular network perturbations in human cancer: application to mutated genes and gene fusions involved in acute lymphoblastic leukemia},
author = {Leon Hajingabo},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/209126},
year = {2015},
date = {2015-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Dal Pozzolo, Andrea Adaptive Machine Learning for Credit Card Fraud Detection PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/221654,
title = {Adaptive Machine Learning for Credit Card Fraud Detection},
author = {Andrea Dal Pozzolo},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/221654/5/contratDalPozzolo.pdf},
year = {2015},
date = {2015-01-01},
abstract = {Billions of dollars are lost every year to fraudulent credit card transactions. The design of efficient fraud detection algorithms is key to reducing these losses, and more and more algorithms rely on advanced machine learning techniques to assist fraud investigators. The design of fraud detection algorithms is however particularly challenging due to the non-stationary distribution of the data, the highly unbalanced class distributions and the availability of only a few transactions labeled by fraud investigators. At the same time, public data are scarcely available due to confidentiality issues, leaving unanswered many questions about the best strategy. In this thesis we aim to provide some answers by focusing on crucial issues such as: i) why and how undersampling is useful in the presence of class imbalance (i.e. when frauds are a small percentage of the transactions), ii) how to deal with unbalanced and evolving data streams (non-stationarity due to fraud evolution and changes in spending behavior), iii) how to assess performance in a way that is relevant for detection, and iv) how to use the feedback provided by investigators on the fraud alerts generated. Finally, we design and assess a prototype of a Fraud Detection System that meets real-world working conditions and is able to integrate investigators' feedback to generate accurate alerts.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
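Point (i) of the abstract, undersampling under class imbalance, comes with a well-known caveat: a model trained on a rebalanced sample produces biased posterior probabilities. The sketch below trains on an undersampled set and applies the standard correction p = beta*ps / (beta*ps - ps + 1) from the undersampling literature, where beta is the fraction of negatives kept; data and model are illustrative.

# Undersampling for class imbalance, with posterior-probability correction.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(5)
X = rng.normal(size=(20000, 5))
y = (rng.random(20000) < 1 / (1 + np.exp(-(X[:, 0] - 4)))).astype(int)  # rare frauds

neg, pos = np.flatnonzero(y == 0), np.flatnonzero(y == 1)
beta = len(pos) / len(neg)                        # fraction of negatives kept
kept_neg = rng.choice(neg, size=len(pos), replace=False)
idx = np.concatenate([pos, kept_neg])             # balanced training set

model = LogisticRegression().fit(X[idx], y[idx])
ps = model.predict_proba(X)[:, 1]                 # biased: trained on balanced data
p = beta * ps / (beta * ps - ps + 1)              # corrected posterior probability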
|
Lopes, Miguel Inference of gene networks from time series expression data and application to type 1 Diabetes PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/216729,
title = {Inference of gene networks from time series expression data and application to type 1 Diabetes},
author = {Miguel Lopes},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/216729/6/contratGasparLopes.pdf},
year = {2015},
date = {2015-01-01},
abstract = {The inference of gene regulatory networks (GRN) is of great importance to medical research, as causal mechanisms responsible for phenotypes are unravelled and potential therapeutic targets identified. In type 1 diabetes, insulin-producing pancreatic beta-cells are the target of an auto-immune attack leading to apoptosis (cell suicide). Although key genes and regulations have been identified, a precise characterization of the process leading to beta-cell apoptosis has not yet been achieved. The inference of relevant molecular pathways in type 1 diabetes is therefore a crucial research topic. GRN inference from gene expression data (obtained from microarrays and RNA-seq technology) is a causal inference problem which may be tackled with well-established statistical and machine learning concepts. In particular, the use of time series facilitates the identification of the causal direction in cause-effect gene pairs. However, inference from gene expression data is a very challenging problem due to the large number of existing genes (in human, over twenty thousand) and the typically low number of samples in gene expression datasets. In this context, it is important to correctly assess the accuracy of network inference methods. The contributions of this thesis concern three distinct aspects. The first is inference assessment using precision-recall curves, in particular the area under the curve (AUPRC). The typical approach to assess AUPRC significance is Monte Carlo, and a parametric alternative is proposed. It consists in deriving the mean and variance of the null AUPRC and then using these parameters to fit a beta distribution approximating the true distribution. The second contribution is an investigation of network inference from time series. Several state-of-the-art strategies are experimentally assessed and novel heuristics are proposed. One is a fast approximation of first-order Granger causality scores, suited for GRN inference in the large-variable case. Another identifies co-regulated genes (i.e. regulated by the same genes). Both are experimentally validated using microarray and simulated time series. The third contribution concerns type 1 diabetes: a study of beta-cell gene expression after exposure to cytokines, emulating the mechanisms leading to apoptosis. Eight datasets of beta-cell gene expression were used to identify genes differentially expressed before and after 24h, which were functionally characterized using bioinformatics tools. The two most differentially expressed genes, previously unknown in the type 1 diabetes literature (RIPK2 and ELF3), were found to modulate cytokine-induced apoptosis. A regulatory network was then inferred using a dynamic adaptation of a state-of-the-art network inference method. Three out of four predicted regulations (involving RIPK2 and ELF3) were experimentally confirmed, providing a proof of concept for the adopted approach.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
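The parametric AUPRC assessment described in the abstract can be sketched as: obtain the mean and variance of the null AUPRC, fit a beta distribution by moment matching, and read off a p-value. The thesis derives the moments analytically; for brevity the sketch estimates them by Monte Carlo, so the numbers are purely illustrative.

# Null AUPRC distribution approximated by a moment-matched beta distribution.
import numpy as np
from scipy import stats
from sklearn.metrics import average_precision_score

rng = np.random.default_rng(6)
n, n_pos = 200, 20
labels = np.zeros(n, dtype=int)
labels[:n_pos] = 1

null_auprc = np.array([average_precision_score(labels, rng.permutation(n))
                       for _ in range(2000)])      # AUPRC under random ranking

mu, var = null_auprc.mean(), null_auprc.var()
common = mu * (1 - mu) / var - 1                   # method-of-moments beta fit
a, b = mu * common, (1 - mu) * common

observed = 0.35                                    # AUPRC of some inference method
print((null_auprc >= observed).mean())             # Monte Carlo p-value
print(stats.beta.sf(observed, a, b))               # parametric p-value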
|
Lerman, Liran A machine learning approach for automatic and generic side-channel attacks PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209070,
title = {A machine learning approach for automatic and generic side-channel attacks},
author = {Liran Lerman},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/209070/2/be487c5b-7b94-414c-bf2e-96847aa98284.txt},
year = {2015},
date = {2015-01-01},
abstract = {The ubiquity of interconnected devices leads to a massive interest in computer security, provided among other things by the field of cryptography. For decades, cryptography specialists assessed the security level of a cryptographic algorithm independently of its implementation in a device. However, since the publication of implementation attacks in 1996, physical attacks have become an active research field, taking into account the physical properties of cryptographic devices. In this dissertation, we focus on profiled attacks. Traditionally, profiled attacks apply parametric methods in which a priori information on the physical properties is assumed. The field of machine learning produces automatic and generic models that do not require a priori information on the phenomenon under study. This dissertation sheds new light on the capabilities of machine learning methods. We first demonstrate that parametric profiled attacks outperform machine learning methods when there is no estimation or assumption error. In contrast, attacks based on machine learning are advantageous in realistic scenarios where the amount of data available in the learning step is small. We then propose a new formal evaluation metric that makes it possible (1) to compare parametric and non-parametric attacks and (2) to interpret the results of each method. The new measure identifies the causes of a high or low success rate of an attack and, consequently, gives directions for improving the evaluation of an implementation. Finally, we present experimental results on unprotected and protected devices. The first study shows that machine learning achieves a higher success rate than a parametric method when only few data are available. The second experiment demonstrates that a protected device can be attacked with a machine learning approach, requiring the same amount of profiling data as when attacking an unprotected product. We also show that parametric methods overestimate or underestimate the security level provided by the device, whereas the machine learning-based approach improves this estimation. In summary, our thesis is that attacks based on machine learning are advantageous compared to classical techniques when the amount of a priori information on the target device and the amount of data available in the learning phase are small.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
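To make the parametric-versus-machine-learning comparison above concrete, here is a toy profiled attack on synthetic traces. The Hamming-weight leakage model, the noise level and all names are assumptions for illustration; a Gaussian (pooled-covariance) discriminant stands in for the parametric template attack and a random forest for the machine learning approach.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
HW = np.array([bin(v).count("1") for v in range(256)])  # Hamming weights

def leak(values, n_points=8, noise=2.0):
    # Each trace leaks the Hamming weight of a secret byte plus noise.
    return HW[values][:, None] + noise * rng.standard_normal((len(values), n_points))

train_v, test_v = rng.integers(0, 256, 2000), rng.integers(0, 256, 500)
X_tr, X_te = leak(train_v), leak(test_v)
y_tr, y_te = HW[train_v], HW[test_v]

for clf in (LinearDiscriminantAnalysis(), RandomForestClassifier(random_state=0)):
    # Profiling step (fit) then attack step (classify unseen traces).
    print(type(clf).__name__, clf.fit(X_tr, y_tr).score(X_te, y_te))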
Hajingabo, Leon Analyzing molecular network perturbations in human cancer: application to mutated genes and gene fusions involved in acute lymphoblastic leukemia PhD Thesis 2015, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209126,
title = {Analyzing molecular network perturbations in human cancer: application to mutated genes and gene fusions involved in acute lymphoblastic leukemia},
author = {Leon Hajingabo},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/209126/3/d6d6225e-dfa8-46a3-985a-2ef19c71eff1.txt},
year = {2015},
date = {2015-01-01},
abstract = {The sequencing of the human genome and the emergence of new high-throughput genomic technologies have opened new investigation models for the systematic analysis of human diseases. We can now attempt to understand diseases such as cancer from a more global perspective, by identifying the genes responsible for cancers and by studying how their protein products operate within a network of molecular interactions. In this context, we collected the genes specifically linked to acute lymphoblastic leukaemia (ALL) and identified new interaction partners connecting key ALL-associated genes such as NOTCH1, FBW7, KRAS and PTPN11 within an interaction network. We also set out to predict the functional impact of genomic variations such as the gene fusions involved in ALL. Using as models three chromosomal translocations frequently identified in B-cell ALL, ETV6-RUNX1 (TEL-AML1), BCR-ABL1 and E2A-PBX1 (TCF3-PBX1), we adapted an oncogene prediction approach to predict molecular perturbations in ALL. We showed that the Myc- and JunD-dependent transcriptomic circuits are specifically deregulated following the TEL-AML1 and TCF3-PBX1 gene fusions, respectively. We also identified the NXF1-dependent mRNA export mechanism as a direct target of the TCF3-PBX1 fusion protein. Through this approach combining interactome data and gene expression analyses, we provided new insight into the molecular understanding of acute lymphoblastic leukaemia.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
2014
|
Kidzinski, Lukasz Inference for stationary functional time series: dimension reduction and regression PhD Thesis 2014, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209226,
title = {Inference for stationary functional time series: dimension reduction and regression},
author = {Lukasz Kidzinski},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/209226},
year = {2014},
date = {2014-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Taieb, Souhaib Ben Machine learning strategies for multi-step-ahead time series forecasting PhD Thesis 2014, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209234,
title = {Machine learning strategies for multi-step-ahead time series forecasting},
author = {Souhaib Ben Taieb},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/209234/4/2c5e8bfe-3eab-4c2a-acb0-843504ddfcbd.txt},
year = {2014},
date = {2014-01-01},
abstract = {How much electricity is going to be consumed in the next 24 hours? What will the temperature be for the next three days? What will the number of sales of a certain product be for the next few months? Answering these questions often requires forecasting several future observations from a given sequence of historical observations, called a time series.

Historically, time series forecasting has been mainly studied in econometrics and statistics. In the last two decades, machine learning, a field concerned with the development of algorithms that can automatically learn from data, has become one of the most active areas of predictive modeling research. This success is largely due to the superior performance of machine learning prediction algorithms in applications as diverse as natural language processing, speech recognition and spam detection. However, there has been very little research at the intersection of time series forecasting and machine learning.

The goal of this dissertation is to narrow this gap by addressing the problem of multi-step-ahead time series forecasting from the perspective of machine learning. To that end, we propose a series of forecasting strategies based on machine learning algorithms.

Multi-step-ahead forecasts can be produced recursively by iterating a one-step-ahead model, or directly, using a specific model for each horizon. As a first contribution, we conduct an in-depth study comparing recursive and direct forecasts generated with different learning algorithms for different data generating processes. More precisely, we decompose the multi-step mean squared forecast errors into bias and variance components, and analyze their behavior over the forecast horizon for different time series lengths. The results and observations made in this study then guide the development of new forecasting strategies.

In particular, we find that choosing between recursive and direct forecasts is not an easy task, since it involves a trade-off between bias and estimation variance that depends on many interacting factors, including the learning model, the underlying data generating process, the time series length and the forecast horizon. As a second contribution, we develop multi-stage forecasting strategies that do not treat the recursive and direct strategies as competitors, but seek to combine their best properties. More precisely, the multi-stage strategies generate recursive linear forecasts and then adjust these forecasts by modeling the multi-step forecast residuals with direct nonlinear models at each horizon, called rectification models. We propose a first multi-stage strategy, which we call the rectify strategy, that estimates the rectification models using the nearest neighbors model. However, because recursive linear forecasts often need only small adjustments with real-world time series, we also consider a second multi-stage strategy, called the boost strategy, that estimates the rectification models using gradient boosting algorithms based on so-called weak learners.

Generating multi-step forecasts using a different model at each horizon provides large modeling flexibility. However, selecting these models independently can lead to irregularities in the forecasts, which can contribute to increasing the forecast variance. The problem is exacerbated with nonlinear machine learning models estimated from short time series. To address this issue, and as a third contribution, we introduce and analyze multi-horizon forecasting strategies that exploit the information contained in other horizons when learning the model for each horizon. In particular, to select the lag order and the hyperparameters of each model, multi-horizon strategies minimize forecast errors over multiple horizons rather than just the horizon of interest.

We compare all the proposed strategies with both the recursive and direct strategies. We first apply a bias and variance study, then evaluate the different strategies using real-world time series from two past forecasting competitions. For the rectify strategy, in addition to avoiding the choice between recursive and direct forecasts, the results demonstrate that it performs better than, or at least close to, the best of the recursive and direct forecasts in different settings. For the multi-horizon strategies, the results emphasize the decrease in variance compared to single-horizon strategies, especially with linear or weakly nonlinear data generating processes. Overall, we found that the accuracy of multi-step-ahead forecasts based on machine learning algorithms can be significantly improved if an appropriate forecasting strategy is used to select the model parameters and to generate the forecasts.

Lastly, as a fourth contribution, we participated in the Load Forecasting track of the Global Energy Forecasting Competition 2012. The competition involved a hierarchical load forecasting problem where we were required to backcast and forecast hourly loads for a US utility with twenty geographical zones. Our team, TinTin, ranked fifth out of 105 participating teams, and we were awarded an IEEE Power & Energy Society award.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
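The recursive and direct strategies contrasted in this abstract can be sketched in a few lines of Python. This is an illustrative baseline, not the thesis implementation; the linear learner and the lag embedding are assumptions.

import numpy as np
from sklearn.linear_model import LinearRegression

def embed(series, lags):
    # Lag-embed a univariate series into (inputs, one-step-ahead targets).
    X = np.array([series[t - lags:t] for t in range(lags, len(series))])
    return X, np.asarray(series[lags:])

def forecast_recursive(series, lags, horizon):
    # One one-step-ahead model, iterated; its own forecasts are fed back.
    X, y = embed(series, lags)
    model = LinearRegression().fit(X, y)
    window = list(series[-lags:])
    for _ in range(horizon):
        window.append(model.predict([window[-lags:]])[0])
    return window[lags:]

def forecast_direct(series, lags, horizon):
    # One dedicated model per horizon h, trained on pairs (X_t, y_{t+h}).
    X, y = embed(series, lags)
    return [LinearRegression().fit(X[:len(X) - h], y[h:])
            .predict([series[-lags:]])[0] for h in range(horizon)]

The rectify strategy described in the abstract then fits direct models to the residuals of the recursive forecasts rather than choosing between the two.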
2013
|
Olsen, Catharina Causal inference and prior integration in bioinformatics using information theory PhD Thesis 2013, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209401,
title = {Causal inference and prior integration in bioinformatics using information theory},
author = {Catharina Olsen},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/209401/1/ba9583ce-51e9-4718-b438-fb816d60aea4.txt},
year = {2013},
date = {2013-01-01},
abstract = {An important problem in bioinformatics is the reconstruction of gene regulatory networks from expression data. The analysis of genomic data stemming from high-throughput technologies such as microarray experiments or RNA-sequencing faces several difficulties. The first major issue is the high variable-to-sample ratio, which is due to a number of factors: a single experiment captures all genes, while the number of experiments is restricted by cost, time and patient cohort size. The second problem is that these data sets typically exhibit high amounts of noise.

Another important problem in bioinformatics is how the quality of inferred networks can be evaluated. The current best practice is a two-step procedure. In the first step, the highest scoring interactions are compared to known interactions stored in biological databases. An inferred network passes this quality assessment if there is a large overlap with the known interactions. In this case, a second step is carried out in which unknown but high-scoring and thus promising new interactions are validated 'by hand' via laboratory experiments. Unfortunately, when prior knowledge is integrated into the inference procedure, this validation is biased by using the same information in both the inference and the validation, and therefore no longer allows an independent validation of the resulting network.

The main contribution of this thesis is a complete computational framework that uses experimental knock-down data in a cross-validation scheme to both infer and validate directed networks. Its components are (i) a method that integrates genomic data and prior knowledge to infer directed networks, (ii) its implementation in an R/Bioconductor package and (iii) a web application to retrieve prior knowledge from PubMed abstracts and biological databases. To infer directed networks from genomic data and prior knowledge, we propose a two-step procedure: first, we adapt the pairwise feature selection strategy mRMR to integrate prior knowledge in order to obtain the network's skeleton; then, for the subsequent orientation phase of the algorithm, we extend a criterion based on interaction information to include prior knowledge. The implementation of this method is available both as part of the prior retrieval tool Predictive Networks and as a stand-alone R/Bioconductor package named predictionet.

Furthermore, we propose a fully data-driven quantitative validation of such directed networks using experimental knock-down data: we start by identifying the set of genes that were truly affected by the perturbation experiment. The rationale of our validation procedure is that these truly affected genes should also be among the perturbed gene's children in the inferred network. Consequently, we can compute a performance score.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
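The skeleton-inference step above (mRMR extended with prior knowledge) can be illustrated as follows. The additive prior term, the weighting alpha and all names are our assumptions for illustration, not the predictionet implementation.

import numpy as np
from sklearn.feature_selection import mutual_info_regression

def mrmr_prior(X, y, prior, k, alpha=0.5):
    # Greedy mRMR: at each step pick the feature maximising relevance to
    # the target minus mean redundancy with the already-selected features,
    # plus a weighted prior-knowledge score for that feature.
    relevance = mutual_info_regression(X, y, random_state=0)
    selected, remaining = [], list(range(X.shape[1]))
    while len(selected) < k and remaining:
        def score(f):
            red = (np.mean([mutual_info_regression(X[:, [g]], X[:, f],
                                                   random_state=0)[0]
                            for g in selected]) if selected else 0.0)
            return relevance[f] - red + alpha * prior[f]
        best = max(remaining, key=score)
        selected.append(best)
        remaining.remove(best)
    return selected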
2011
|
Miranda, Abhilash Alexander Spectral factor model for time series learning PhD Thesis 2011, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/209812,
title = {Spectral factor model for time series learning},
author = {Abhilash Alexander Miranda},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/209812/4/4feadeb4-adfa-49eb-957e-9e1bdfdcd5ed.txt},
year = {2011},
date = {2011-01-01},
abstract = {Today's computerized processes generate massive amounts of streaming data. In many applications, data is collected for modeling the processes. The process model is hoped to drive objectives such as decision support, data visualization, business intelligence, automation and control, pattern recognition and classification. However, data-driven modeling of processes faces significant challenges. Apart from the errors, outliers and noise in the data measurements, the main challenge is the large dimensionality, i.e. the number of variables each data sample measures. The samples often form a long temporal sequence called a multivariate time series, where any one sample is influenced by the others. We wish to build a model that ensures robust generation, reviewing and representation of new multivariate time series that are consistent with the underlying process.

In this thesis, we adopt a modeling framework to extract characteristics from multivariate time series that correspond to the dynamic variation-covariation common to the measured variables across all the samples. Those characteristics of a multivariate time series are named its 'commonalities', and a suitable measure for them is defined. What makes the multivariate time series model versatile is the assumption of a latent time series, of known or presumed characteristics and of much lower dimensionality than the measured time series; the result is the well-known 'dynamic factor model'. Original variants of existing methods for estimating the dynamic factor model are developed: the estimation is performed using the frequency-domain equivalent of the dynamic factor model, named the 'spectral factor model'. To estimate the spectral factor model, ideas are sought from the asymptotic theory of spectral estimates. This theory is used to obtain a probabilistic formulation providing maximum likelihood estimates of the spectral factor model parameters. Maximum likelihood parameters are then derived with the analysis carried out entirely in the spectral domain, such that the dynamically transformed latent time series inherits the commonalities maximally.

The main contribution of this thesis is a learning framework using the spectral factor model. By learning we mean the ability of a computational model of a process to robustly characterize the data the process generates for purposes of pattern matching, classification and prediction. Hence, the spectral factor model may be claimed to have learned a multivariate time series if the dynamically transformed latent time series extracts the commonalities reliably and maximally. The spectral factor model is used for two main multivariate time series learning applications. First, real-world streaming datasets obtained from various processes are classified; in this exercise, human brain magnetoencephalography signals obtained during various cognitive and physical tasks are classified. Second, the commonalities are put to the test by asking for reliable prediction of a multivariate time series given its past evolution; share prices in a portfolio are forecasted as part of this challenge.

For both spectral factor modeling and learning, an analytical solution as well as an iterative solution are developed. While the analytical solution is based on a low-rank approximation of the spectral density function, the iterative solution is based on the expectation-maximization algorithm. For the human brain signal classification exercise, a strategy is developed for comparing similarities between the commonalities of the various classes of multivariate time series processes. For the share price prediction problem, a vector autoregressive model whose parameters are enriched with the maximum likelihood commonalities is designed. In both learning problems, the spectral factor model gives commendable performance with respect to competing approaches.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
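The analytical solution mentioned above, a low-rank approximation of the spectral density, can be sketched as follows. The Welch-type pairwise estimator and the function names are assumptions for illustration, not the thesis code.

import numpy as np
from scipy.signal import csd

def spectral_density(X, fs=1.0, nperseg=128):
    # Cross-spectral density matrices S(f) of a multivariate series X
    # (n_samples x p variables), estimated pairwise by Welch averaging.
    p = X.shape[1]
    f, _ = csd(X[:, 0], X[:, 0], fs=fs, nperseg=nperseg)
    S = np.zeros((len(f), p, p), dtype=complex)
    for i in range(p):
        for j in range(p):
            _, S[:, i, j] = csd(X[:, i], X[:, j], fs=fs, nperseg=nperseg)
    return f, S

def spectral_factors(S, k):
    # Rank-k approximation of each S(f): the leading eigenpairs play the
    # role of frequency-domain factor loadings carrying the commonalities.
    eigvals, eigvecs = np.linalg.eigh(S)   # batched over frequencies
    return eigvals[:, -k:], eigvecs[:, :, -k:]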
2009
|
Haibe-Kains, Benjamin Identification and assessment of gene signatures in human breast cancer PhD Thesis 2009, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210348b,
title = {Identification and assessment of gene signatures in human breast cancer},
author = {Benjamin Haibe-Kains},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/210348},
year = {2009},
date = {2009-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
Caelen, Olivier Sélection séquentielle en environnement aléatoire appliquée à l'apprentissage supervisé PhD Thesis 2009, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210265,
title = {Sélection séquentielle en environnement aléatoire appliquée à l'apprentissage supervisé},
author = {Olivier Caelen},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/210265/1/87553dfb-5157-4fd1-b0bc-98f094979a90.txt},
year = {2009},
date = {2009-01-01},
abstract = {This thesis addresses decision problems in which choices must be made sequentially within a random environment. At each step of such a decision problem, an alternative must be selected from a set of alternatives. Each alternative has its own mean gain and, when selected, yields a random gain. The selection may pursue two kinds of objective.

In the first case, the trials aim at maximising the sum of the collected gains; a proper trade-off must then be found between exploitation and exploration. This problem is commonly known in the scientific literature as the "multi-armed bandit problem".

In the second case, a maximum number of selections is imposed, and the objective is to allocate these selections so as to increase the chances of finding the alternative with the highest mean gain. This second problem is commonly referred to in the literature as "selecting the best".

Greedy selection plays an important role in solving these decision problems: it operates by choosing the alternative that has appeared optimal so far. However, the generally random nature of the environment makes the outcome of such a selection uncertain. In this thesis, we introduce a new quantity, called the "expected gain of a greedy action". Building on some properties of this quantity, new algorithms for solving the two aforementioned decision problems are proposed. Particular attention is paid to applying the presented techniques to model selection in supervised machine learning.

A collaboration with the anaesthesiology department of the Erasme Hospital allowed us to apply the proposed algorithms to real data from the medical domain. We also developed a decision-support system, a prototype of which has already been tested in real conditions on a small sample of patients.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
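The sequential setting described in this abstract is easy to sketch in Python. The "expected gain of a greedy action" itself is the thesis's analytical contribution and is not reproduced here; the sketch below is only the surrounding set-up, a plain epsilon-greedy rule with illustrative names.

import numpy as np

def run_bandit(arm_means, n_steps, eps=0.1, seed=0):
    # At each step, select the empirically best alternative (the greedy
    # action) or, with probability eps, explore a random one.
    rng = np.random.default_rng(seed)
    k = len(arm_means)
    counts, sums = np.zeros(k), np.zeros(k)
    total = 0.0
    for t in range(n_steps):
        if t < k:                                # play each alternative once
            arm = t
        elif rng.random() < eps:
            arm = int(rng.integers(k))           # exploration
        else:
            arm = int(np.argmax(sums / counts))  # exploitation
        reward = rng.normal(arm_means[arm], 1.0) # random gain
        counts[arm] += 1
        sums[arm] += reward
        total += reward
    return total

print(run_bandit([0.1, 0.5, 0.9], n_steps=1000))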
Kontos, Kevin Gaussian graphical model selection for gene regulatory network reverse engineering and function prediction PhD Thesis 2009, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210301,
title = {Gaussian graphical model selection for gene regulatory network reverse engineering and function prediction},
author = {Kevin Kontos},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/210301/1/453ad8e7-667f-4c22-ab95-dc953d05b89d.txt},
year = {2009},
date = {2009-01-01},
abstract = {One of the most important and challenging "knowledge extraction" tasks in bioinformatics is the reverse engineering of gene regulatory networks (GRNs) from DNA microarray gene expression data. Indeed, as a result of the development of high-throughput data-collection techniques, biology is experiencing a data flood phenomenon that pushes biologists toward a new view of biology--systems biology--that aims at system-level understanding of biological systems.

Unfortunately, even for small model organisms such as the yeast Saccharomyces cerevisiae, the number p of genes is much larger than the number n of expression data samples. The dimensionality issue induced by this "small n, large p" data setting renders standard statistical learning methods inadequate. Restricting the complexity of the models makes it possible to deal with this serious impediment. Indeed, by introducing (a priori undesirable) bias in the model selection procedure, one reduces the variance of the selected model, thereby increasing its accuracy.

Gaussian graphical models (GGMs) have proven to be a very powerful formalism to infer GRNs from expression data. Standard GGM selection techniques can unfortunately not be used in the "small n, large p" data setting. One way to overcome this issue is to resort to regularization. In particular, shrinkage estimators of the covariance matrix--required to infer GGMs--have proven to be very effective. Our first contribution consists of a new shrinkage estimator that improves upon existing ones through the use of a Monte Carlo (parametric bootstrap) procedure.

Another approach to GGM selection in the "small n, large p" data setting consists of reverse engineering limited-order partial correlation graphs (q-partial correlation graphs) to approximate GGMs. Our second contribution consists of an inference algorithm, the q-nested procedure, that builds a sequence of nested q-partial correlation graphs to take advantage of the smaller order graphs' topology when inferring higher order graphs. This allows us to significantly speed up the inference of such graphs and to avoid problems related to multiple testing. Consequently, we are able to consider higher order graphs, thereby increasing the accuracy of the inferred graphs.

Another important challenge in bioinformatics is the prediction of gene function. An example of such a prediction task is the identification of genes that are targets of the nitrogen catabolite repression (NCR) selection mechanism in the yeast Saccharomyces cerevisiae. The study of model organisms such as Saccharomyces cerevisiae is indispensable for the understanding of more complex organisms. Our third contribution consists of extending the standard two-class classification approach by enriching the set of variables and comparing several feature selection techniques and classification algorithms.

Finally, our fourth contribution formulates the prediction of NCR target genes as a network inference task. We use GGM selection to infer multivariate dependencies between genes and, starting from a set of genes known to be sensitive to NCR, classify the remaining genes. We hence avoid problems related to the choice of a negative training set and take advantage of the robustness of GGM selection techniques in the "small n, large p" data setting.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|
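The role of covariance shrinkage in GGM selection described above can be illustrated in a few lines. The thesis refines the shrinkage intensity with a parametric bootstrap; the sketch below substitutes the standard Ledoit-Wolf estimator, so it approximates the idea rather than the proposed method.

import numpy as np
from sklearn.covariance import LedoitWolf

def partial_correlations(X):
    # A shrinkage covariance estimate stays well-conditioned even when
    # p > n, so it can be inverted; the rescaled negative off-diagonal
    # entries of the precision matrix are the partial correlations whose
    # non-zeros define the GGM's edges.
    precision = np.linalg.inv(LedoitWolf().fit(X).covariance_)
    d = np.sqrt(np.diag(precision))
    pcor = -precision / np.outer(d, d)
    np.fill_diagonal(pcor, 1.0)
    return pcor

# Example: 40 samples of 100 'genes' -- a "small n, large p" setting.
X = np.random.default_rng(0).standard_normal((40, 100))
print(partial_correlations(X).shape)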
Le Borgne, Yann-Aël Learning in wireless sensor networks for energy-efficient environmental monitoring PhD Thesis 2009, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210334,
title = {Learning in wireless sensor networks for energy-efficient environmental monitoring},
author = {Yann-Aël Le Borgne},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/210334/1/7ecacef0-cc8a-42b5-9065-5abf27a87b5e.txt},
year = {2009},
date = {2009-01-01},
abstract = {Wireless sensor networks form an emerging class of computing devices capable of observing the world with unprecedented resolution, and promise to provide a revolutionary instrument for environmental monitoring. Such a network is composed of a collection of battery-operated wireless sensors, or sensor nodes, each equipped with sensing, processing and wireless communication capabilities. Thanks to advances in microelectronics and wireless technologies, wireless sensors are small in size and can be deployed at low cost over different kinds of environments in order to monitor, over both space and time, the variations of physical quantities such as temperature, humidity, light, or sound.

In environmental monitoring studies, many applications are expected to run unattended for months or years. Sensor nodes are, however, constrained by limited resources, particularly in terms of energy. Since communication is one order of magnitude more energy-consuming than processing, the design of data collection schemes that limit the amount of transmitted data is recognized as a central issue for wireless sensor networks.

An efficient way to address this challenge is to approximate, by means of mathematical models, the evolution of the measurements taken by sensors over space and/or time. Whenever a mathematical model can be used in place of the true measurements, significant communication gains may be obtained by transmitting only the parameters of the model instead of the measurements themselves. Since in most cases there is little or no a priori information about the variations of the sensor measurements, the models must be identified in an automated manner. This calls for machine learning techniques, which make it possible to model the variations of future measurements on the basis of past ones.

This thesis brings two main contributions to the use of learning techniques in a sensor network. First, we propose an approach that combines time series prediction and model selection to reduce the amount of communication. The rationale of this approach, called adaptive model selection, is to let the sensors determine in an automated manner a prediction model that not only fits their measurements but also reduces the amount of transmitted data.

The second main contribution is the design of a distributed approach for modeling sensed data, based on principal component analysis (PCA). The proposed method transforms the measurements along a routing tree in such a way that (i) most of the variability in the measurements is retained, and (ii) the network load sustained by sensor nodes is reduced and more evenly distributed, which in turn extends the overall network lifetime. The framework can be seen as a truly distributed approach to principal component analysis, and finds applications not only in approximate data collection tasks, but also in event detection and recognition tasks.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
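The "transmit a model instead of the measurements" idea lends itself to a compact illustration. The sketch below is a deliberate simplification, not the thesis's adaptive model selection algorithm: the predictor is the simplest possible model, a constant, and the tolerance eps is picked by hand, whereas adaptive model selection lets each node choose automatically among richer candidate models. The function name and the synthetic signal are invented for the example.

import numpy as np

def constant_model_stream(measurements, eps):
    """Yield only the samples a node would transmit; the sink predicts the
    rest with the last transmitted value (a constant prediction model)."""
    last_sent = None
    sent = 0
    for x in measurements:
        if last_sent is None or abs(x - last_sent) > eps:
            last_sent = x      # model update: a new constant prediction
            sent += 1
            yield x
    print(f"transmitted {sent} of {len(measurements)} samples")

rng = np.random.default_rng(1)
temps = 20 + np.cumsum(rng.normal(0, 0.05, size=1000))  # slowly drifting signal
transmitted = list(constant_model_stream(temps, eps=0.5))

Radio usage then scales with the variability of the signal rather than with the sampling rate, which is exactly the trade-off the abstract describes.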
|
Haibe-Kains, Benjamin Identification and assessment of gene signatures in human breast cancer PhD Thesis 2009, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210348,
title = {Identification and assessment of gene signatures in human breast cancer},
author = {Benjamin Haibe-Kains},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/210348/1/0edb320b-c02a-49c0-bb87-8165a5d231ec.txt},
year = {2009},
date = {2009-01-01},
abstract = {This thesis addresses the use of machine learning techniques to develop clinical diagnostic tools for breast cancer using molecular data. These tools are designed to assist physicians in their evaluation of the clinical outcome of breast cancer (referred to as prognosis).

The traditional approach to evaluating breast cancer prognosis is based on the assessment of clinico-pathologic factors known to be associated with breast cancer survival. These factors are used to make recommendations about whether further treatment is required after the removal of a tumor by surgery. Treatment such as chemotherapy depends on the estimation of patients' risk of relapse. Although current approaches do provide good prognostic assessment of breast cancer survival, clinicians are aware that there is still room for improvement in the accuracy of their prognostic estimations.

In the late nineties, new high-throughput technologies such as gene expression profiling through microarrays emerged. Microarrays allowed scientists to analyze for the first time the expression of the whole human genome (the "transcriptome"). It was hoped that the analysis of genome-wide molecular data would bring new insights into the critical biological mechanisms underlying breast cancer progression, as well as significantly improve prognostic prediction. However, the analysis of microarray data is a difficult task owing to its intrinsic characteristics: (i) thousands of gene expressions are measured for only a few samples; (ii) the measurements are usually noisy; and (iii) they are highly correlated due to gene co-expression. Since traditional statistical methods were not adapted to these settings, machine learning methods emerged as good candidates to overcome these difficulties. However, applying machine learning methods to microarray analysis involves numerous steps, and the results are prone to overfitting. Several authors highlighted the major pitfalls of this process in early publications, shedding new light on promising but overoptimistic results.

Since 2002, large comparative studies have been conducted to identify the key characteristics of successful methods for class discovery and classification. Yet methods able to identify robust molecular signatures that can predict breast cancer prognosis have been lacking. To fill this important gap, this thesis presents an original methodology dealing specifically with the analysis of microarray and survival data in order to build prognostic models and provide an honest estimation of their performance. The approach used for signature extraction consists of a set of original methods for feature transformation, feature selection and prediction model building. A novel statistical framework is presented for the performance assessment and comparison of risk prediction models.

In terms of applications, we show that these methods, used in combination with a priori biological knowledge of breast cancer and numerous public microarray datasets, have resulted in important discoveries. In particular, the research presented here develops (i) a robust model for the identification of breast cancer molecular subtypes and (ii) a new prognostic model that takes into account the previously observed molecular heterogeneity of breast cancers, in order to improve on traditional clinical guidelines and state-of-the-art gene signatures.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
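The abstract's emphasis on honest performance assessment of risk prediction models invites a concrete example of how such models are scored on censored survival data. The sketch below shows a generic textbook measure, Harrell-style pairwise concordance, as an illustration only and not as the thesis's specific statistical framework; the function name and toy data are invented.

import numpy as np

def concordance_index(time, event, risk):
    """Among usable patient pairs, the fraction in which the patient with the
    higher predicted risk fails first.
    time: follow-up times; event: 1 if relapse/death observed, 0 if censored;
    risk: model-predicted risk scores (higher = worse prognosis)."""
    num, den = 0.0, 0
    n = len(time)
    for i in range(n):
        for j in range(n):
            if event[i] == 1 and time[i] < time[j]:  # usable (comparable) pair
                den += 1
                if risk[i] > risk[j]:
                    num += 1.0
                elif risk[i] == risk[j]:
                    num += 0.5
    return num / den

# Toy data: five patients, two censored (event = 0).
time  = np.array([5.0, 8.0, 12.0, 3.0, 9.0])
event = np.array([1, 0, 1, 1, 0])
risk  = np.array([0.9, 0.4, 0.3, 0.8, 0.2])
print(round(concordance_index(time, event, risk), 3))

A value of 0.5 corresponds to random risk ordering and 1.0 to perfect concordance, which is what makes such indices convenient for comparing risk prediction models across datasets.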
|
2008
|
Meyer, Patrick E. Information-theoretic variable selection and network inference from microarray data PhD Thesis 2008, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/210396,
title = {Information-theoretic variable selection and network inference from microarray data},
author = {Patrick E. Meyer},
url = {https://dipot.ulb.ac.be/dspace/bitstream/2013/210396/1/baf3a39e-3c11-496d-8b3b-b952a1827ca0.txt},
year = {2008},
date = {2008-01-01},
abstract = {Statisticians are used to modelling interactions between variables on the basis of observed data. In many emerging fields, such as bioinformatics, they are confronted with datasets having thousands of variables, a lot of noise, non-linear dependencies and only tens of samples. Detecting functional relationships under such uncertainty constitutes a major challenge.

Our work focuses on variable selection and network inference from datasets having many variables and few samples (a high variable-to-sample ratio), such as microarray data. Variable selection is the branch of machine learning whose objective is to select, among a set of input variables, those that lead to the best predictive model. Applying variable selection methods to gene expression data makes it possible, for example, to improve cancer diagnosis and prognosis by identifying new molecular signatures of the disease. Network inference consists in representing the dependencies between the variables of a dataset as a graph. Applied to microarray data, network inference can thus reverse engineer the transcriptional regulatory network of the cell with a view to discovering new drug targets.

In this work, two original tools are proposed: MASSIVE (Matrix of Average Sub-Subset Information for Variable Elimination), a new feature selection method, and MRNET (Minimum Redundancy NETwork), a new network inference algorithm. Both tools rely on the computation of mutual information, an information-theoretic measure of dependency. More precisely, MASSIVE and MRNET approximate the mutual information between a subset of variables and a target variable using combinations of mutual informations between sub-subsets of variables and the target. These approximations make it possible to estimate a series of low-variate densities instead of one large multivariate density. Low-variate densities are well suited to high variable-to-sample-ratio datasets, since they are cheap in terms of computational cost and do not require a large number of samples to be estimated accurately. Numerous experimental results show the competitiveness of these new approaches. Finally, our thesis has led to freely available source code for MASSIVE and an open-source R and Bioconductor package for network inference.},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
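The two ingredients named in the abstract, a mutual information estimator and a relevance/redundancy trade-off, can be sketched in a few lines. The code below is a crude stand-in, not the published MRNET algorithm (the reference implementation is the open-source R/Bioconductor package the abstract mentions, minet): it uses a simple histogram MI estimate and a global redundancy penalty with an arbitrary weight k, whereas MRNET applies a max-relevance/min-redundancy forward selection around each target gene. The function names and toy data are invented for the illustration.

import numpy as np

def mutual_info(x, y, bins=8):
    """Histogram plug-in estimate of the mutual information I(X;Y), in nats."""
    pxy, _, _ = np.histogram2d(x, y, bins=bins)
    pxy /= pxy.sum()
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)
    nz = pxy > 0
    return float(np.sum(pxy[nz] * np.log(pxy[nz] / np.outer(px, py)[nz])))

def infer_network(X, k=0.5):
    """Edge scores = pairwise MI minus k times an average-redundancy penalty;
    a simplified stand-in for MRNET's max-relevance/min-redundancy criterion."""
    p = X.shape[1]
    mi = np.zeros((p, p))
    for i in range(p):
        for j in range(i + 1, p):
            mi[i, j] = mi[j, i] = mutual_info(X[:, i], X[:, j])
    red = mi.mean(axis=1)
    score = mi - k * 0.5 * (red[:, None] + red[None, :])
    np.fill_diagonal(score, 0.0)
    return score

rng = np.random.default_rng(2)
n = 200
g0 = rng.standard_normal(n)
X = rng.standard_normal((n, 6))               # six "genes", 200 samples
X[:, 0] = g0
X[:, 1] = g0 + 0.3 * rng.standard_normal(n)   # gene 1 regulated by gene 0
W = infer_network(X)
i, j = np.unravel_index(np.argmax(W), W.shape)
print("strongest inferred edge:", (i, j))     # the planted pair, (0, 1) or (1, 0)

With the planted dependency, the top-scoring edge recovers the regulating pair, while the redundancy penalty damps scores between genes that merely share information with many others.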
|
2000
|
Bontempi, Gianluca Local learning techniques for modeling, prediction and control PhD Thesis 2000, (Funder: Universite Libre de Bruxelles). @phdthesis{info:hdl:2013/211823,
title = {Local learning techniques for modeling, prediction and control},
author = {Gianluca Bontempi},
url = {http://hdl.handle.net/2013/ULB-DIPOT:oai:dipot.ulb.ac.be:2013/211823},
year = {2000},
date = {2000-01-01},
note = {Funder: Universite Libre de Bruxelles},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
|